xref: /freebsd/sys/kern/kern_umtx.c (revision 4d293dd8dcde59fc9842a0ce1125fef8fcf83a8c)
1 /*-
2  * Copyright (c) 2004, David Xu <davidxu@freebsd.org>
3  * Copyright (c) 2002, Jeffrey Roberson <jeff@freebsd.org>
4  * All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice unmodified, this list of conditions, and the following
11  *    disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
17  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
19  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
20  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
21  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
22  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
25  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 #include <sys/cdefs.h>
29 __FBSDID("$FreeBSD$");
30 
31 #include "opt_compat.h"
32 #include "opt_umtx_profiling.h"
33 
34 #include <sys/param.h>
35 #include <sys/kernel.h>
36 #include <sys/limits.h>
37 #include <sys/lock.h>
38 #include <sys/malloc.h>
39 #include <sys/mutex.h>
40 #include <sys/priv.h>
41 #include <sys/proc.h>
42 #include <sys/sbuf.h>
43 #include <sys/sched.h>
44 #include <sys/smp.h>
45 #include <sys/sysctl.h>
46 #include <sys/sysent.h>
47 #include <sys/systm.h>
48 #include <sys/sysproto.h>
49 #include <sys/syscallsubr.h>
50 #include <sys/eventhandler.h>
51 #include <sys/umtx.h>
52 
53 #include <vm/vm.h>
54 #include <vm/vm_param.h>
55 #include <vm/pmap.h>
56 #include <vm/vm_map.h>
57 #include <vm/vm_object.h>
58 
59 #include <machine/cpu.h>
60 
61 #ifdef COMPAT_FREEBSD32
62 #include <compat/freebsd32/freebsd32_proto.h>
63 #endif
64 
/*
 * Values of the `mode' argument of do_lock_normal(): _UMUTEX_TRY makes it
 * return EBUSY instead of sleeping, _UMUTEX_WAIT makes it return as soon as
 * the owner word reads unowned or contested, without acquiring the mutex.
 */
65 #define _UMUTEX_TRY		1
66 #define _UMUTEX_WAIT		2
67 
68 #ifdef UMTX_PROFILING
/*
 * True when the pair (w, f) is strictly greater than the pair (sw, sf),
 * comparing the first members and using the second ones only to break ties.
 */
69 #define	UPROF_PERC_BIGGER(w, f, sw, sf)					\
70 	(((w) > (sw)) || ((w) == (sw) && (f) > (sf)))
71 #endif
72 
73 /* Priority inheritance mutex info. */
74 struct umtx_pi {
75 	/* Owner thread */
76 	struct thread		*pi_owner;
77 
78 	/* Reference count */
79 	int			pi_refcount;
80 
81  	/* List entry linking the PI mutexes held by a thread */
82 	TAILQ_ENTRY(umtx_pi)	pi_link;
83 
84 	/* List entry in hash (see uc_pi_list in struct umtxq_chain) */
85 	TAILQ_ENTRY(umtx_pi)	pi_hashlink;
86 
87 	/* List of waiters (umtx_q entries) */
88 	TAILQ_HEAD(,umtx_q)	pi_blocked;
89 
90 	/* Identifies the userland lock object */
91 	struct umtx_key		pi_key;
92 };
93 
94 /* A user (waiter) of a userland synchronization object. */
95 struct umtx_q {
96 	/* Linked list for the hash. */
97 	TAILQ_ENTRY(umtx_q)	uq_link;
98 
99 	/* Umtx key. */
100 	struct umtx_key		uq_key;
101 
102 	/* Umtx flags. */
103 	int			uq_flags;
	/* Set while this entry is linked on a umtxq_queue. */
104 #define UQF_UMTXQ	0x0001
105 
106 	/* The thread that waits. */
107 	struct thread		*uq_thread;
108 
109 	/*
110 	 * Blocked on PI mutex.  Reads may be done under either the chain
111 	 * lock or umtx_lock; writes must hold both the chain lock and
112 	 * umtx_lock.
113 	 */
114 	struct umtx_pi		*uq_pi_blocked;
115 
116 	/* On blocked list */
117 	TAILQ_ENTRY(umtx_q)	uq_lockq;
118 
119 	/* PI mutexes for which other threads are contending with us */
120 	TAILQ_HEAD(,umtx_pi)	uq_pi_contested;
121 
122 	/* Inherited priority from PP mutex */
123 	u_char			uq_inherited_pri;
124 
125 	/* Spare queue ready to be reused */
126 	struct umtxq_queue	*uq_spare_queue;
127 
128 	/* The queue we are on */
129 	struct umtxq_queue	*uq_cur_queue;
130 };
131 
/* Head type for a tail queue of waiters. */
132 TAILQ_HEAD(umtxq_head, umtx_q);
133 
134 /* Per-key wait-queue */
135 struct umtxq_queue {
	/* Waiters sleeping on `key', linked through uq_link. */
136 	struct umtxq_head	head;
	/* Key shared by every waiter on this queue. */
137 	struct umtx_key		key;
	/* Linkage on a chain's uc_queue[] or uc_spare_queue list. */
138 	LIST_ENTRY(umtxq_queue)	link;
	/* Number of entries currently on `head'. */
139 	int			length;
140 };
141 
/* Head type for a list of per-key wait-queues. */
142 LIST_HEAD(umtxq_list, umtxq_queue);
143 
144 /* Userland lock object's wait-queue chain */
145 struct umtxq_chain {
146 	/* Lock for this chain. */
147 	struct mtx		uc_lock;
148 
149 	/* List of sleep queues. */
150 	struct umtxq_list	uc_queue[2];
	/* Indices into uc_queue[]. */
151 #define UMTX_SHARED_QUEUE	0
152 #define UMTX_EXCLUSIVE_QUEUE	1
153 
	/* Unused per-key queues donated by waiters sharing an existing queue. */
154 	LIST_HEAD(, umtxq_queue) uc_spare_queue;
155 
156 	/* Busy flag */
157 	char			uc_busy;
158 
159 	/* Number of threads sleeping until the chain is no longer busy */
160 	int			uc_waiters;
161 
162 	/* All PI in the list */
163 	TAILQ_HEAD(,umtx_pi)	uc_pi_list;
164 
165 #ifdef UMTX_PROFILING
	/* Current and highest observed number of per-key queues on the chain. */
166 	u_int 			length;
167 	u_int			max_length;
168 #endif
169 };
170 
/* Assert that the current thread owns the lock of chain `uc'. */
171 #define	UMTXQ_LOCKED_ASSERT(uc)		mtx_assert(&(uc)->uc_lock, MA_OWNED)
172 
173 /*
174  * Don't propagate time-sharing priority.  There is a security reason:
175  * a user can simply introduce a PI-mutex, let thread A lock the mutex,
176  * and let another thread B block on the mutex.  Because B is
177  * sleeping, its priority will be boosted; this causes A's priority to
178  * be boosted via priority propagation too, and it will never be lowered
179  * even if it is using 100% CPU, which is unfair to other processes.
180  */
181 
/*
 * Priority used for a thread: every user priority inside the time-sharing
 * range is reported as PRI_MAX_TIMESHARE, any other value is used as is.
 */
182 #define UPRI(td)	(((td)->td_user_pri >= PRI_MIN_TIMESHARE &&\
183 			  (td)->td_user_pri <= PRI_MAX_TIMESHARE) ?\
184 			 PRI_MAX_TIMESHARE : (td)->td_user_pri)
185 
/*
 * Parameters of umtxq_hash(): the key is multiplied by GOLDEN_RATIO_PRIME
 * and the top 9 bits of the word select one of the UMTX_CHAINS (2^9) buckets.
 */
186 #define	GOLDEN_RATIO_PRIME	2654404609U
187 #define	UMTX_CHAINS		512
188 #define	UMTX_SHIFTS		(__WORD_BIT - 9)
189 
/* Map the USYNC_PROCESS_SHARED bit of `flags' to a umtx_key_get() share mode. */
190 #define	GET_SHARE(flags)	\
191     (((flags) & USYNC_PROCESS_SHARED) == 0 ? THREAD_SHARE : PROCESS_SHARE)
192 
/* Iterations umtxq_busy() spins on a busy chain before going to sleep. */
193 #define BUSY_SPINS		200
194 
/*
 * Deadline bookkeeping for a timed sleep: `end' is the absolute expiry time
 * on clock `clockid' and `cur' is the most recent reading of that clock.
 */
195 struct abs_timeout {
196 	int clockid;
197 	struct timespec cur;
198 	struct timespec end;
199 };
200 
/* UMA zone for struct umtx_pi, created in umtxq_sysinit(). */
201 static uma_zone_t		umtx_pi_zone;
/* Two hash tables of wait-queue chains; umtxq_getchain() picks one by key type. */
202 static struct umtxq_chain	umtxq_chains[2][UMTX_CHAINS];
203 static MALLOC_DEFINE(M_UMTX, "umtx", "UMTX queue memory");
/* Exported read-only as debug.umtx.umtx_pi_allocated. */
204 static int			umtx_pi_allocated;
205 
206 static SYSCTL_NODE(_debug, OID_AUTO, umtx, CTLFLAG_RW, 0, "umtx debug");
207 SYSCTL_INT(_debug_umtx, OID_AUTO, umtx_pi_allocated, CTLFLAG_RD,
208     &umtx_pi_allocated, 0, "Allocated umtx_pi");
209 
210 #ifdef UMTX_PROFILING
/* Largest per-chain length seen so far, updated by umtxq_insert_queue(). */
211 static long max_length;
212 SYSCTL_LONG(_debug_umtx, OID_AUTO, max_length, CTLFLAG_RD, &max_length, 0, "max_length");
213 static SYSCTL_NODE(_debug_umtx, OID_AUTO, chains, CTLFLAG_RD, 0, "umtx chain stats");
214 #endif
215 
/* Forward declarations of file-local helpers. */
216 static void umtxq_sysinit(void *);
217 static void umtxq_hash(struct umtx_key *key);
218 static struct umtxq_chain *umtxq_getchain(struct umtx_key *key);
219 static void umtxq_lock(struct umtx_key *key);
220 static void umtxq_unlock(struct umtx_key *key);
221 static void umtxq_busy(struct umtx_key *key);
222 static void umtxq_unbusy(struct umtx_key *key);
223 static void umtxq_insert_queue(struct umtx_q *uq, int q);
224 static void umtxq_remove_queue(struct umtx_q *uq, int q);
225 static int umtxq_sleep(struct umtx_q *uq, const char *wmesg, struct abs_timeout *);
226 static int umtxq_count(struct umtx_key *key);
227 static struct umtx_pi *umtx_pi_alloc(int);
228 static void umtx_pi_free(struct umtx_pi *pi);
229 static int do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags);
230 static void umtx_thread_cleanup(struct thread *td);
231 static void umtx_exec_hook(void *arg __unused, struct proc *p __unused,
232 	struct image_params *imgp __unused);
/* Run umtxq_sysinit() at boot. */
233 SYSINIT(umtx, SI_SUB_EVENTHANDLER+1, SI_ORDER_MIDDLE, umtxq_sysinit, NULL);
234 
/* Shorthands that operate on the shared queue (UMTX_SHARED_QUEUE). */
235 #define umtxq_signal(key, nwake)	umtxq_signal_queue((key), (nwake), UMTX_SHARED_QUEUE)
236 #define umtxq_insert(uq)	umtxq_insert_queue((uq), UMTX_SHARED_QUEUE)
237 #define umtxq_remove(uq)	umtxq_remove_queue((uq), UMTX_SHARED_QUEUE)
238 
/* Global mutex, initialized in umtxq_sysinit(). */
239 static struct mtx umtx_lock;
240 
241 #ifdef UMTX_PROFILING
242 static void
243 umtx_init_profiling(void)
244 {
245 	struct sysctl_oid *chain_oid;
246 	char chain_name[10];
247 	int i;
248 
249 	for (i = 0; i < UMTX_CHAINS; ++i) {
250 		snprintf(chain_name, sizeof(chain_name), "%d", i);
251 		chain_oid = SYSCTL_ADD_NODE(NULL,
252 		    SYSCTL_STATIC_CHILDREN(_debug_umtx_chains), OID_AUTO,
253 		    chain_name, CTLFLAG_RD, NULL, "umtx hash stats");
254 		SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO,
255 		    "max_length0", CTLFLAG_RD, &umtxq_chains[0][i].max_length, 0, NULL);
256 		SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO,
257 		    "max_length1", CTLFLAG_RD, &umtxq_chains[1][i].max_length, 0, NULL);
258 	}
259 }
260 
261 static int
262 sysctl_debug_umtx_chains_peaks(SYSCTL_HANDLER_ARGS)
263 {
264 	char buf[512];
265 	struct sbuf sb;
266 	struct umtxq_chain *uc;
267 	u_int fract, i, j, tot, whole;
268 	u_int sf0, sf1, sf2, sf3, sf4;
269 	u_int si0, si1, si2, si3, si4;
270 	u_int sw0, sw1, sw2, sw3, sw4;
271 
272 	sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN);
273 	for (i = 0; i < 2; i++) {
274 		tot = 0;
275 		for (j = 0; j < UMTX_CHAINS; ++j) {
276 			uc = &umtxq_chains[i][j];
277 			mtx_lock(&uc->uc_lock);
278 			tot += uc->max_length;
279 			mtx_unlock(&uc->uc_lock);
280 		}
281 		if (tot == 0)
282 			sbuf_printf(&sb, "%u) Empty ", i);
283 		else {
284 			sf0 = sf1 = sf2 = sf3 = sf4 = 0;
285 			si0 = si1 = si2 = si3 = si4 = 0;
286 			sw0 = sw1 = sw2 = sw3 = sw4 = 0;
287 			for (j = 0; j < UMTX_CHAINS; j++) {
288 				uc = &umtxq_chains[i][j];
289 				mtx_lock(&uc->uc_lock);
290 				whole = uc->max_length * 100;
291 				mtx_unlock(&uc->uc_lock);
292 				fract = (whole % tot) * 100;
293 				if (UPROF_PERC_BIGGER(whole, fract, sw0, sf0)) {
294 					sf0 = fract;
295 					si0 = j;
296 					sw0 = whole;
297 				} else if (UPROF_PERC_BIGGER(whole, fract, sw1,
298 				    sf1)) {
299 					sf1 = fract;
300 					si1 = j;
301 					sw1 = whole;
302 				} else if (UPROF_PERC_BIGGER(whole, fract, sw2,
303 				    sf2)) {
304 					sf2 = fract;
305 					si2 = j;
306 					sw2 = whole;
307 				} else if (UPROF_PERC_BIGGER(whole, fract, sw3,
308 				    sf3)) {
309 					sf3 = fract;
310 					si3 = j;
311 					sw3 = whole;
312 				} else if (UPROF_PERC_BIGGER(whole, fract, sw4,
313 				    sf4)) {
314 					sf4 = fract;
315 					si4 = j;
316 					sw4 = whole;
317 				}
318 			}
319 			sbuf_printf(&sb, "queue %u:\n", i);
320 			sbuf_printf(&sb, "1st: %u.%u%% idx: %u\n", sw0 / tot,
321 			    sf0 / tot, si0);
322 			sbuf_printf(&sb, "2nd: %u.%u%% idx: %u\n", sw1 / tot,
323 			    sf1 / tot, si1);
324 			sbuf_printf(&sb, "3rd: %u.%u%% idx: %u\n", sw2 / tot,
325 			    sf2 / tot, si2);
326 			sbuf_printf(&sb, "4th: %u.%u%% idx: %u\n", sw3 / tot,
327 			    sf3 / tot, si3);
328 			sbuf_printf(&sb, "5th: %u.%u%% idx: %u\n", sw4 / tot,
329 			    sf4 / tot, si4);
330 		}
331 	}
332 	sbuf_trim(&sb);
333 	sbuf_finish(&sb);
334 	sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req);
335 	sbuf_delete(&sb);
336 	return (0);
337 }
338 
339 static int
340 sysctl_debug_umtx_chains_clear(SYSCTL_HANDLER_ARGS)
341 {
342 	struct umtxq_chain *uc;
343 	u_int i, j;
344 	int clear, error;
345 
346 	clear = 0;
347 	error = sysctl_handle_int(oidp, &clear, 0, req);
348 	if (error != 0 || req->newptr == NULL)
349 		return (error);
350 
351 	if (clear != 0) {
352 		for (i = 0; i < 2; ++i) {
353 			for (j = 0; j < UMTX_CHAINS; ++j) {
354 				uc = &umtxq_chains[i][j];
355 				mtx_lock(&uc->uc_lock);
356 				uc->length = 0;
357 				uc->max_length = 0;
358 				mtx_unlock(&uc->uc_lock);
359 			}
360 		}
361 	}
362 	return (0);
363 }
364 
/* debug.umtx.chains.clear: read-write integer; see the handler for its effect. */
365 SYSCTL_PROC(_debug_umtx_chains, OID_AUTO, clear,
366     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0,
367     sysctl_debug_umtx_chains_clear, "I", "Clear umtx chains statistics");
/* debug.umtx.chains.peaks: read-only string produced by the peaks handler. */
368 SYSCTL_PROC(_debug_umtx_chains, OID_AUTO, peaks,
369     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0,
370     sysctl_debug_umtx_chains_peaks, "A", "Highest peaks in chains max length");
371 #endif
372 
373 static void
374 umtxq_sysinit(void *arg __unused)
375 {
376 	int i, j;
377 
378 	umtx_pi_zone = uma_zcreate("umtx pi", sizeof(struct umtx_pi),
379 		NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
380 	for (i = 0; i < 2; ++i) {
381 		for (j = 0; j < UMTX_CHAINS; ++j) {
382 			mtx_init(&umtxq_chains[i][j].uc_lock, "umtxql", NULL,
383 				 MTX_DEF | MTX_DUPOK);
384 			LIST_INIT(&umtxq_chains[i][j].uc_queue[0]);
385 			LIST_INIT(&umtxq_chains[i][j].uc_queue[1]);
386 			LIST_INIT(&umtxq_chains[i][j].uc_spare_queue);
387 			TAILQ_INIT(&umtxq_chains[i][j].uc_pi_list);
388 			umtxq_chains[i][j].uc_busy = 0;
389 			umtxq_chains[i][j].uc_waiters = 0;
390 #ifdef UMTX_PROFILING
391 			umtxq_chains[i][j].length = 0;
392 			umtxq_chains[i][j].max_length = 0;
393 #endif
394 		}
395 	}
396 #ifdef UMTX_PROFILING
397 	umtx_init_profiling();
398 #endif
399 	mtx_init(&umtx_lock, "umtx lock", NULL, MTX_DEF);
400 	EVENTHANDLER_REGISTER(process_exec, umtx_exec_hook, NULL,
401 	    EVENTHANDLER_PRI_ANY);
402 }
403 
404 struct umtx_q *
405 umtxq_alloc(void)
406 {
407 	struct umtx_q *uq;
408 
409 	uq = malloc(sizeof(struct umtx_q), M_UMTX, M_WAITOK | M_ZERO);
410 	uq->uq_spare_queue = malloc(sizeof(struct umtxq_queue), M_UMTX, M_WAITOK | M_ZERO);
411 	TAILQ_INIT(&uq->uq_spare_queue->head);
412 	TAILQ_INIT(&uq->uq_pi_contested);
413 	uq->uq_inherited_pri = PRI_MAX;
414 	return (uq);
415 }
416 
417 void
418 umtxq_free(struct umtx_q *uq)
419 {
420 	MPASS(uq->uq_spare_queue != NULL);
421 	free(uq->uq_spare_queue, M_UMTX);
422 	free(uq, M_UMTX);
423 }
424 
425 static inline void
426 umtxq_hash(struct umtx_key *key)
427 {
428 	unsigned n = (uintptr_t)key->info.both.a + key->info.both.b;
429 	key->hash = ((n * GOLDEN_RATIO_PRIME) >> UMTX_SHIFTS) % UMTX_CHAINS;
430 }
431 
432 static inline struct umtxq_chain *
433 umtxq_getchain(struct umtx_key *key)
434 {
435 	if (key->type <= TYPE_SEM)
436 		return (&umtxq_chains[1][key->hash]);
437 	return (&umtxq_chains[0][key->hash]);
438 }
439 
440 /*
441  * Lock a chain.
442  */
443 static inline void
444 umtxq_lock(struct umtx_key *key)
445 {
446 	struct umtxq_chain *uc;
447 
448 	uc = umtxq_getchain(key);
449 	mtx_lock(&uc->uc_lock);
450 }
451 
452 /*
453  * Unlock a chain.
454  */
455 static inline void
456 umtxq_unlock(struct umtx_key *key)
457 {
458 	struct umtxq_chain *uc;
459 
460 	uc = umtxq_getchain(key);
461 	mtx_unlock(&uc->uc_lock);
462 }
463 
464 /*
465  * Set chain to busy state when following operation
466  * may be blocked (kernel mutex can not be used).
467  */
/*
 * The caller must hold the chain lock; it is still held on return.  If the
 * chain is already busy, the lock may be dropped temporarily while spinning
 * (SMP only) and is released by msleep() while sleeping, so state protected
 * by the chain lock may have changed by the time this function returns.
 */
468 static inline void
469 umtxq_busy(struct umtx_key *key)
470 {
471 	struct umtxq_chain *uc;
472 
473 	uc = umtxq_getchain(key);
474 	mtx_assert(&uc->uc_lock, MA_OWNED);
475 	if (uc->uc_busy) {
476 #ifdef SMP
477 		if (smp_cpus > 1) {
478 			int count = BUSY_SPINS;
479 			if (count > 0) {
				/*
				 * Spin for a bounded number of iterations
				 * with the chain lock dropped; uc_busy is
				 * read without the lock here and is
				 * re-checked under the lock below.
				 */
480 				umtxq_unlock(key);
481 				while (uc->uc_busy && --count > 0)
482 					cpu_spinwait();
483 				umtxq_lock(key);
484 			}
485 		}
486 #endif
		/* Still busy: sleep until umtxq_unbusy() wakes us up. */
487 		while (uc->uc_busy) {
488 			uc->uc_waiters++;
489 			msleep(uc, &uc->uc_lock, 0, "umtxqb", 0);
490 			uc->uc_waiters--;
491 		}
492 	}
493 	uc->uc_busy = 1;
494 }
495 
496 /*
497  * Unbusy a chain.
498  */
499 static inline void
500 umtxq_unbusy(struct umtx_key *key)
501 {
502 	struct umtxq_chain *uc;
503 
504 	uc = umtxq_getchain(key);
505 	mtx_assert(&uc->uc_lock, MA_OWNED);
506 	KASSERT(uc->uc_busy != 0, ("not busy"));
507 	uc->uc_busy = 0;
508 	if (uc->uc_waiters)
509 		wakeup_one(uc);
510 }
511 
512 static inline void
513 umtxq_unbusy_unlocked(struct umtx_key *key)
514 {
515 
516 	umtxq_lock(key);
517 	umtxq_unbusy(key);
518 	umtxq_unlock(key);
519 }
520 
521 static struct umtxq_queue *
522 umtxq_queue_lookup(struct umtx_key *key, int q)
523 {
524 	struct umtxq_queue *uh;
525 	struct umtxq_chain *uc;
526 
527 	uc = umtxq_getchain(key);
528 	UMTXQ_LOCKED_ASSERT(uc);
529 	LIST_FOREACH(uh, &uc->uc_queue[q], link) {
530 		if (umtx_key_match(&uh->key, key))
531 			return (uh);
532 	}
533 
534 	return (NULL);
535 }
536 
537 static inline void
538 umtxq_insert_queue(struct umtx_q *uq, int q)
539 {
540 	struct umtxq_queue *uh;
541 	struct umtxq_chain *uc;
542 
543 	uc = umtxq_getchain(&uq->uq_key);
544 	UMTXQ_LOCKED_ASSERT(uc);
545 	KASSERT((uq->uq_flags & UQF_UMTXQ) == 0, ("umtx_q is already on queue"));
546 	uh = umtxq_queue_lookup(&uq->uq_key, q);
547 	if (uh != NULL) {
548 		LIST_INSERT_HEAD(&uc->uc_spare_queue, uq->uq_spare_queue, link);
549 	} else {
550 		uh = uq->uq_spare_queue;
551 		uh->key = uq->uq_key;
552 		LIST_INSERT_HEAD(&uc->uc_queue[q], uh, link);
553 #ifdef UMTX_PROFILING
554 		uc->length++;
555 		if (uc->length > uc->max_length) {
556 			uc->max_length = uc->length;
557 			if (uc->max_length > max_length)
558 				max_length = uc->max_length;
559 		}
560 #endif
561 	}
562 	uq->uq_spare_queue = NULL;
563 
564 	TAILQ_INSERT_TAIL(&uh->head, uq, uq_link);
565 	uh->length++;
566 	uq->uq_flags |= UQF_UMTXQ;
567 	uq->uq_cur_queue = uh;
568 	return;
569 }
570 
571 static inline void
572 umtxq_remove_queue(struct umtx_q *uq, int q)
573 {
574 	struct umtxq_chain *uc;
575 	struct umtxq_queue *uh;
576 
577 	uc = umtxq_getchain(&uq->uq_key);
578 	UMTXQ_LOCKED_ASSERT(uc);
579 	if (uq->uq_flags & UQF_UMTXQ) {
580 		uh = uq->uq_cur_queue;
581 		TAILQ_REMOVE(&uh->head, uq, uq_link);
582 		uh->length--;
583 		uq->uq_flags &= ~UQF_UMTXQ;
584 		if (TAILQ_EMPTY(&uh->head)) {
585 			KASSERT(uh->length == 0,
586 			    ("inconsistent umtxq_queue length"));
587 #ifdef UMTX_PROFILING
588 			uc->length--;
589 #endif
590 			LIST_REMOVE(uh, link);
591 		} else {
592 			uh = LIST_FIRST(&uc->uc_spare_queue);
593 			KASSERT(uh != NULL, ("uc_spare_queue is empty"));
594 			LIST_REMOVE(uh, link);
595 		}
596 		uq->uq_spare_queue = uh;
597 		uq->uq_cur_queue = NULL;
598 	}
599 }
600 
601 /*
602  * Check if there are multiple waiters
603  */
604 static int
605 umtxq_count(struct umtx_key *key)
606 {
607 	struct umtxq_chain *uc;
608 	struct umtxq_queue *uh;
609 
610 	uc = umtxq_getchain(key);
611 	UMTXQ_LOCKED_ASSERT(uc);
612 	uh = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE);
613 	if (uh != NULL)
614 		return (uh->length);
615 	return (0);
616 }
617 
618 /*
619  * Check if there are multiple PI waiters and returns first
620  * waiter.
621  */
622 static int
623 umtxq_count_pi(struct umtx_key *key, struct umtx_q **first)
624 {
625 	struct umtxq_chain *uc;
626 	struct umtxq_queue *uh;
627 
628 	*first = NULL;
629 	uc = umtxq_getchain(key);
630 	UMTXQ_LOCKED_ASSERT(uc);
631 	uh = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE);
632 	if (uh != NULL) {
633 		*first = TAILQ_FIRST(&uh->head);
634 		return (uh->length);
635 	}
636 	return (0);
637 }
638 
639 static int
640 umtxq_check_susp(struct thread *td)
641 {
642 	struct proc *p;
643 	int error;
644 
645 	/*
646 	 * The check for TDF_NEEDSUSPCHK is racy, but it is enough to
647 	 * eventually break the lockstep loop.
648 	 */
649 	if ((td->td_flags & TDF_NEEDSUSPCHK) == 0)
650 		return (0);
651 	error = 0;
652 	p = td->td_proc;
653 	PROC_LOCK(p);
654 	if (P_SHOULDSTOP(p) ||
655 	    ((p->p_flag & P_TRACED) && (td->td_dbgflags & TDB_SUSPEND))) {
656 		if (p->p_flag & P_SINGLE_EXIT)
657 			error = EINTR;
658 		else
659 			error = ERESTART;
660 	}
661 	PROC_UNLOCK(p);
662 	return (error);
663 }
664 
665 /*
666  * Wake up threads waiting on an userland object.
667  */
668 
669 static int
670 umtxq_signal_queue(struct umtx_key *key, int n_wake, int q)
671 {
672 	struct umtxq_chain *uc;
673 	struct umtxq_queue *uh;
674 	struct umtx_q *uq;
675 	int ret;
676 
677 	ret = 0;
678 	uc = umtxq_getchain(key);
679 	UMTXQ_LOCKED_ASSERT(uc);
680 	uh = umtxq_queue_lookup(key, q);
681 	if (uh != NULL) {
682 		while ((uq = TAILQ_FIRST(&uh->head)) != NULL) {
683 			umtxq_remove_queue(uq, q);
684 			wakeup(uq);
685 			if (++ret >= n_wake)
686 				return (ret);
687 		}
688 	}
689 	return (ret);
690 }
691 
692 
693 /*
694  * Wake up specified thread.
695  */
696 static inline void
697 umtxq_signal_thread(struct umtx_q *uq)
698 {
699 	struct umtxq_chain *uc;
700 
701 	uc = umtxq_getchain(&uq->uq_key);
702 	UMTXQ_LOCKED_ASSERT(uc);
703 	umtxq_remove(uq);
704 	wakeup(uq);
705 }
706 
707 static inline int
708 tstohz(const struct timespec *tsp)
709 {
710 	struct timeval tv;
711 
712 	TIMESPEC_TO_TIMEVAL(&tv, tsp);
713 	return tvtohz(&tv);
714 }
715 
716 static void
717 abs_timeout_init(struct abs_timeout *timo, int clockid, int absolute,
718 	const struct timespec *timeout)
719 {
720 
721 	timo->clockid = clockid;
722 	if (!absolute) {
723 		kern_clock_gettime(curthread, clockid, &timo->end);
724 		timo->cur = timo->end;
725 		timespecadd(&timo->end, timeout);
726 	} else {
727 		timo->end = *timeout;
728 		kern_clock_gettime(curthread, clockid, &timo->cur);
729 	}
730 }
731 
732 static void
733 abs_timeout_init2(struct abs_timeout *timo, const struct _umtx_time *umtxtime)
734 {
735 
736 	abs_timeout_init(timo, umtxtime->_clockid,
737 		(umtxtime->_flags & UMTX_ABSTIME) != 0,
738 		&umtxtime->_timeout);
739 }
740 
/* Refresh timo->cur with the current reading of the descriptor's clock. */
741 static inline void
742 abs_timeout_update(struct abs_timeout *timo)
743 {
744 	kern_clock_gettime(curthread, timo->clockid, &timo->cur);
745 }
746 
747 static int
748 abs_timeout_gethz(struct abs_timeout *timo)
749 {
750 	struct timespec tts;
751 
752 	if (timespeccmp(&timo->end, &timo->cur, <=))
753 		return (-1);
754 	tts = timo->end;
755 	timespecsub(&tts, &timo->cur);
756 	return (tstohz(&tts));
757 }
758 
759 /*
760  * Put thread into sleep state, before sleeping, check if
761  * thread was removed from umtx queue.
762  */
/*
 * The chain lock must be held on entry and is held again on return.
 * `abstime' may be NULL for an untimed sleep.
 *
 * Returns 0 once `uq' is found to be off its queue, ETIMEDOUT when the
 * deadline has passed, or the non-EWOULDBLOCK result of msleep().
 */
763 static inline int
764 umtxq_sleep(struct umtx_q *uq, const char *wmesg, struct abs_timeout *abstime)
765 {
766 	struct umtxq_chain *uc;
767 	int error, timo;
768 
769 	uc = umtxq_getchain(&uq->uq_key);
770 	UMTXQ_LOCKED_ASSERT(uc);
771 	for (;;) {
		/* Already dequeued by a waker: nothing to wait for. */
772 		if (!(uq->uq_flags & UQF_UMTXQ))
773 			return (0);
774 		if (abstime != NULL) {
775 			timo = abs_timeout_gethz(abstime);
776 			if (timo < 0)
777 				return (ETIMEDOUT);
778 		} else
779 			timo = 0;
		/* PDROP: msleep() returns with the chain lock released. */
780 		error = msleep(uq, &uc->uc_lock, PCATCH | PDROP, wmesg, timo);
781 		if (error != EWOULDBLOCK) {
782 			umtxq_lock(&uq->uq_key);
783 			break;
784 		}
		/*
		 * The tick timeout fired; re-read the clock (without the
		 * chain lock) and loop to re-evaluate the deadline.
		 */
785 		if (abstime != NULL)
786 			abs_timeout_update(abstime);
787 		umtxq_lock(&uq->uq_key);
788 	}
789 	return (error);
790 }
791 
792 /*
793  * Convert userspace address into unique logical address.
794  */
795 int
796 umtx_key_get(const void *addr, int type, int share, struct umtx_key *key)
797 {
798 	struct thread *td = curthread;
799 	vm_map_t map;
800 	vm_map_entry_t entry;
801 	vm_pindex_t pindex;
802 	vm_prot_t prot;
803 	boolean_t wired;
804 
805 	key->type = type;
806 	if (share == THREAD_SHARE) {
807 		key->shared = 0;
808 		key->info.private.vs = td->td_proc->p_vmspace;
809 		key->info.private.addr = (uintptr_t)addr;
810 	} else {
811 		MPASS(share == PROCESS_SHARE || share == AUTO_SHARE);
812 		map = &td->td_proc->p_vmspace->vm_map;
813 		if (vm_map_lookup(&map, (vm_offset_t)addr, VM_PROT_WRITE,
814 		    &entry, &key->info.shared.object, &pindex, &prot,
815 		    &wired) != KERN_SUCCESS) {
816 			return (EFAULT);
817 		}
818 
819 		if ((share == PROCESS_SHARE) ||
820 		    (share == AUTO_SHARE &&
821 		     VM_INHERIT_SHARE == entry->inheritance)) {
822 			key->shared = 1;
823 			key->info.shared.offset = (vm_offset_t)addr -
824 			    entry->start + entry->offset;
825 			vm_object_reference(key->info.shared.object);
826 		} else {
827 			key->shared = 0;
828 			key->info.private.vs = td->td_proc->p_vmspace;
829 			key->info.private.addr = (uintptr_t)addr;
830 		}
831 		vm_map_lookup_done(map, entry);
832 	}
833 
834 	umtxq_hash(key);
835 	return (0);
836 }
837 
838 /*
839  * Release key.
840  */
841 void
842 umtx_key_release(struct umtx_key *key)
843 {
844 	if (key->shared)
845 		vm_object_deallocate(key->info.shared.object);
846 }
847 
848 /*
849  * Fetch and compare value, sleep on the address if value is not changed.
850  */
/*
 * `addr' is read as a 32-bit word when `compat32' is non-zero and as a long
 * otherwise.  `timeout' may be NULL.  `is_private' selects a THREAD_SHARE
 * key, otherwise AUTO_SHARE is used.
 *
 * Returns 0 when the thread was woken or the value differed from `id',
 * EFAULT when the word cannot be read, an error from umtx_key_get(), or the
 * error from umtxq_sleep() with ERESTART converted to EINTR.
 */
851 static int
852 do_wait(struct thread *td, void *addr, u_long id,
853 	struct _umtx_time *timeout, int compat32, int is_private)
854 {
855 	struct abs_timeout timo;
856 	struct umtx_q *uq;
857 	u_long tmp;
858 	uint32_t tmp32;
859 	int error = 0;
860 
861 	uq = td->td_umtxq;
862 	if ((error = umtx_key_get(addr, TYPE_SIMPLE_WAIT,
863 		is_private ? THREAD_SHARE : AUTO_SHARE, &uq->uq_key)) != 0)
864 		return (error);
865 
866 	if (timeout != NULL)
867 		abs_timeout_init2(&timo, timeout);
868 
	/*
	 * Queue ourselves before reading the user word: a waker that runs
	 * after the read dequeues us, and umtxq_sleep() then returns at once
	 * instead of sleeping.
	 */
869 	umtxq_lock(&uq->uq_key);
870 	umtxq_insert(uq);
871 	umtxq_unlock(&uq->uq_key);
872 	if (compat32 == 0) {
873 		error = fueword(addr, &tmp);
874 		if (error != 0)
875 			error = EFAULT;
876 	} else {
877 		error = fueword32(addr, &tmp32);
878 		if (error == 0)
879 			tmp = tmp32;
880 		else
881 			error = EFAULT;
882 	}
883 	umtxq_lock(&uq->uq_key);
884 	if (error == 0) {
885 		if (tmp == id)
886 			error = umtxq_sleep(uq, "uwait", timeout == NULL ?
887 			    NULL : &timo);
		/* Being dequeued means we were woken: report success. */
888 		if ((uq->uq_flags & UQF_UMTXQ) == 0)
889 			error = 0;
890 		else
891 			umtxq_remove(uq);
892 	} else if ((uq->uq_flags & UQF_UMTXQ) != 0) {
893 		umtxq_remove(uq);
894 	}
895 	umtxq_unlock(&uq->uq_key);
896 	umtx_key_release(&uq->uq_key);
	/* Report an interrupted wait as EINTR rather than ERESTART. */
897 	if (error == ERESTART)
898 		error = EINTR;
899 	return (error);
900 }
901 
902 /*
903  * Wake up threads sleeping on the specified address.
904  */
905 int
906 kern_umtx_wake(struct thread *td, void *uaddr, int n_wake, int is_private)
907 {
908 	struct umtx_key key;
909 	int ret;
910 
911 	if ((ret = umtx_key_get(uaddr, TYPE_SIMPLE_WAIT,
912 		is_private ? THREAD_SHARE : AUTO_SHARE, &key)) != 0)
913 		return (ret);
914 	umtxq_lock(&key);
915 	umtxq_signal(&key, n_wake);
916 	umtxq_unlock(&key);
917 	umtx_key_release(&key);
918 	return (0);
919 }
920 
921 /*
922  * Lock PTHREAD_PRIO_NONE protocol POSIX mutex.
923  */
/*
 * `mode' is 0 for a blocking lock, _UMUTEX_TRY to fail with EBUSY instead of
 * sleeping, or _UMUTEX_WAIT to only wait until the owner word reads unowned
 * or contested without acquiring the mutex.  `timeout' may be NULL.
 *
 * Returns 0 on success, EFAULT when the mutex word cannot be accessed, EBUSY
 * in try mode, or an error from umtx_key_get(), umtxq_sleep() or
 * umtxq_check_susp().
 */
924 static int
925 do_lock_normal(struct thread *td, struct umutex *m, uint32_t flags,
926 	struct _umtx_time *timeout, int mode)
927 {
928 	struct abs_timeout timo;
929 	struct umtx_q *uq;
930 	uint32_t owner, old, id;
931 	int error, rv;
932 
933 	id = td->td_tid;
934 	uq = td->td_umtxq;
935 	error = 0;
936 	if (timeout != NULL)
937 		abs_timeout_init2(&timo, timeout);
938 
939 	/*
940 	 * Care must be exercised when dealing with umtx structure. It
941 	 * can fault on any access.
942 	 */
943 	for (;;) {
944 		rv = fueword32(&m->m_owner, &owner);
945 		if (rv == -1)
946 			return (EFAULT);
947 		if (mode == _UMUTEX_WAIT) {
948 			if (owner == UMUTEX_UNOWNED || owner == UMUTEX_CONTESTED)
949 				return (0);
950 		} else {
951 			/*
952 			 * Try the uncontested case.  This should be done in userland.
953 			 */
954 			rv = casueword32(&m->m_owner, UMUTEX_UNOWNED,
955 			    &owner, id);
956 			/* The address was invalid. */
957 			if (rv == -1)
958 				return (EFAULT);
959 
960 			/* The acquire succeeded. */
961 			if (owner == UMUTEX_UNOWNED)
962 				return (0);
963 
964 			/* If no one owns it but it is contested try to acquire it. */
965 			if (owner == UMUTEX_CONTESTED) {
966 				rv = casueword32(&m->m_owner,
967 				    UMUTEX_CONTESTED, &owner,
968 				    id | UMUTEX_CONTESTED);
969 				/* The address was invalid. */
970 				if (rv == -1)
971 					return (EFAULT);
972 
973 				if (owner == UMUTEX_CONTESTED)
974 					return (0);
975 
976 				rv = umtxq_check_susp(td);
977 				if (rv != 0)
978 					return (rv);
979 
980 				/* If this failed the lock has changed, restart. */
981 				continue;
982 			}
983 		}
984 
985 		if (mode == _UMUTEX_TRY)
986 			return (EBUSY);
987 
988 		/*
989 		 * If we caught a signal, we have retried and now
990 		 * exit immediately.
991 		 */
992 		if (error != 0)
993 			return (error);
994 
995 		if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX,
996 		    GET_SHARE(flags), &uq->uq_key)) != 0)
997 			return (error);
998 
		/*
		 * Queue ourselves with the chain marked busy.  The unlock
		 * path marks the chain busy as well, so it cannot run between
		 * our enqueue and the update of the contested bit below.
		 */
999 		umtxq_lock(&uq->uq_key);
1000 		umtxq_busy(&uq->uq_key);
1001 		umtxq_insert(uq);
1002 		umtxq_unlock(&uq->uq_key);
1003 
1004 		/*
1005 		 * Set the contested bit so that a release in user space
1006 		 * knows to use the system call for unlock.  If this fails
1007 		 * either some one else has acquired the lock or it has been
1008 		 * released.
1009 		 */
1010 		rv = casueword32(&m->m_owner, owner, &old,
1011 		    owner | UMUTEX_CONTESTED);
1012 
1013 		/* The address was invalid. */
1014 		if (rv == -1) {
1015 			umtxq_lock(&uq->uq_key);
1016 			umtxq_remove(uq);
1017 			umtxq_unbusy(&uq->uq_key);
1018 			umtxq_unlock(&uq->uq_key);
1019 			umtx_key_release(&uq->uq_key);
1020 			return (EFAULT);
1021 		}
1022 
1023 		/*
1024 		 * We set the contested bit, sleep. Otherwise the lock changed
1025 		 * and we need to retry or we lost a race to the thread
1026 		 * unlocking the umtx.
1027 		 */
1028 		umtxq_lock(&uq->uq_key);
1029 		umtxq_unbusy(&uq->uq_key);
1030 		if (old == owner)
1031 			error = umtxq_sleep(uq, "umtxn", timeout == NULL ?
1032 			    NULL : &timo);
1033 		umtxq_remove(uq);
1034 		umtxq_unlock(&uq->uq_key);
1035 		umtx_key_release(&uq->uq_key);
1036 
		/*
		 * A non-zero error is not returned here: the loop makes one
		 * more acquisition attempt and returns it at the check above.
		 */
1037 		if (error == 0)
1038 			error = umtxq_check_susp(td);
1039 	}
1040 
	/* Not reached: the loop above only exits through return. */
1041 	return (0);
1042 }
1043 
1044 /*
1045  * Unlock PTHREAD_PRIO_NONE protocol POSIX mutex.
1046  */
/*
 * Returns 0 on success, EFAULT when the mutex word cannot be accessed, EPERM
 * when the calling thread is not the owner, EINVAL when the owner word
 * changed underneath the final update, or an error from umtx_key_get().
 */
1047 static int
1048 do_unlock_normal(struct thread *td, struct umutex *m, uint32_t flags)
1049 {
1050 	struct umtx_key key;
1051 	uint32_t owner, old, id;
1052 	int error;
1053 	int count;
1054 
1055 	id = td->td_tid;
1056 	/*
1057 	 * Make sure we own this mtx.
1058 	 */
1059 	error = fueword32(&m->m_owner, &owner);
1060 	if (error == -1)
1061 		return (EFAULT);
1062 
1063 	if ((owner & ~UMUTEX_CONTESTED) != id)
1064 		return (EPERM);
1065 
	/* Uncontested: a plain compare-and-set to unowned is enough. */
1066 	if ((owner & UMUTEX_CONTESTED) == 0) {
1067 		error = casueword32(&m->m_owner, owner, &old, UMUTEX_UNOWNED);
1068 		if (error == -1)
1069 			return (EFAULT);
1070 		if (old == owner)
1071 			return (0);
		/* The word changed meanwhile; continue with the new value. */
1072 		owner = old;
1073 	}
1074 
1075 	/* We should only ever be in here for contested locks */
1076 	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
1077 	    &key)) != 0)
1078 		return (error);
1079 
	/*
	 * Mark the chain busy so the waiter count stays meaningful while
	 * the user word is updated without the chain lock held.
	 */
1080 	umtxq_lock(&key);
1081 	umtxq_busy(&key);
1082 	count = umtxq_count(&key);
1083 	umtxq_unlock(&key);
1084 
1085 	/*
1086 	 * When unlocking the umtx, it must be marked as unowned if
1087 	 * there is zero or one thread only waiting for it.
1088 	 * Otherwise, it must be marked as contested.
1089 	 */
1090 	error = casueword32(&m->m_owner, owner, &old,
1091 	    count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
	/* A waiter is woken and the chain unbusied even if the update failed. */
1092 	umtxq_lock(&key);
1093 	umtxq_signal(&key,1);
1094 	umtxq_unbusy(&key);
1095 	umtxq_unlock(&key);
1096 	umtx_key_release(&key);
1097 	if (error == -1)
1098 		return (EFAULT);
1099 	if (old != owner)
1100 		return (EINVAL);
1101 	return (0);
1102 }
1103 
1104 /*
1105  * Check if the mutex is available and wake up a waiter,
1106  * only for simple mutex.
1107  */
/*
 * Returns 0 without doing anything when the mutex currently has an owner.
 * Otherwise returns 0 on success, EFAULT when the mutex cannot be accessed,
 * or an error from umtx_key_get().
 */
1108 static int
1109 do_wake_umutex(struct thread *td, struct umutex *m)
1110 {
1111 	struct umtx_key key;
1112 	uint32_t owner;
1113 	uint32_t flags;
1114 	int error;
1115 	int count;
1116 
1117 	error = fueword32(&m->m_owner, &owner);
1118 	if (error == -1)
1119 		return (EFAULT);
1120 
	/* Somebody owns the mutex: nothing to hand over. */
1121 	if ((owner & ~UMUTEX_CONTESTED) != 0)
1122 		return (0);
1123 
1124 	error = fueword32(&m->m_flags, &flags);
1125 	if (error == -1)
1126 		return (EFAULT);
1127 
1128 	/* We should only ever be in here for contested locks */
1129 	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
1130 	    &key)) != 0)
1131 		return (error);
1132 
1133 	umtxq_lock(&key);
1134 	umtxq_busy(&key);
1135 	count = umtxq_count(&key);
1136 	umtxq_unlock(&key);
1137 
	/*
	 * With at most one waiter, try to clear the contested bit; `owner'
	 * receives the value the word held before the attempt.
	 */
1138 	if (count <= 1) {
1139 		error = casueword32(&m->m_owner, UMUTEX_CONTESTED, &owner,
1140 		    UMUTEX_UNOWNED);
1141 		if (error == -1)
1142 			error = EFAULT;
1143 	}
1144 
	/* Wake one waiter only if the mutex is still seen without an owner. */
1145 	umtxq_lock(&key);
1146 	if (error == 0 && count != 0 && (owner & ~UMUTEX_CONTESTED) == 0)
1147 		umtxq_signal(&key, 1);
1148 	umtxq_unbusy(&key);
1149 	umtxq_unlock(&key);
1150 	umtx_key_release(&key);
1151 	return (error);
1152 }
1153 
1154 /*
1155  * Check if the mutex has waiters and tries to fix contention bit.
 *
 * 'flags' selects the key type (normal, priority-inherit or
 * priority-protect); EINVAL is returned when both priority bits are set.
 * With more than one waiter queued, UMUTEX_CONTESTED is set in the owner
 * word unless it is already set.  With exactly one waiter it is set only
 * while the mutex is owned.  If the operation ends with EFAULT every
 * waiter is woken; otherwise a single waiter is woken when there is one
 * and the mutex is unowned.
 *
 * Returns 0, EINVAL, EFAULT, or an error from umtx_key_get() or
 * umtxq_check_susp().
1156  */
1157 static int
1158 do_wake2_umutex(struct thread *td, struct umutex *m, uint32_t flags)
1159 {
1160 	struct umtx_key key;
1161 	uint32_t owner, old;
1162 	int type;
1163 	int error;
1164 	int count;
1165 
1166 	switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
1167 	case 0:
1168 		type = TYPE_NORMAL_UMUTEX;
1169 		break;
1170 	case UMUTEX_PRIO_INHERIT:
1171 		type = TYPE_PI_UMUTEX;
1172 		break;
1173 	case UMUTEX_PRIO_PROTECT:
1174 		type = TYPE_PP_UMUTEX;
1175 		break;
1176 	default:
1177 		return (EINVAL);
1178 	}
1179 	if ((error = umtx_key_get(m, type, GET_SHARE(flags),
1180 	    &key)) != 0)
1181 		return (error);
1182 
	/*
	 * 'owner' stays 0 when no userland access is made below, so the
	 * final wakeup test then treats the mutex as unowned.
	 */
1183 	owner = 0;
1184 	umtxq_lock(&key);
1185 	umtxq_busy(&key);
1186 	count = umtxq_count(&key);
1187 	umtxq_unlock(&key);
1188 	/*
1189 	 * Only repair contention bit if there is a waiter, this means the mutex
1190 	 * is still being referenced by userland code, otherwise don't update
1191 	 * any memory.
1192 	 */
1193 	if (count > 1) {
1194 		error = fueword32(&m->m_owner, &owner);
1195 		if (error == -1)
1196 			error = EFAULT;
		/* Retry the compare-and-set until the bit is seen set. */
1197 		while (error == 0 && (owner & UMUTEX_CONTESTED) == 0) {
1198 			error = casueword32(&m->m_owner, owner, &old,
1199 			    owner | UMUTEX_CONTESTED);
1200 			if (error == -1) {
1201 				error = EFAULT;
1202 				break;
1203 			}
1204 			if (old == owner)
1205 				break;
1206 			owner = old;
1207 			error = umtxq_check_susp(td);
1208 			if (error != 0)
1209 				break;
1210 		}
1211 	} else if (count == 1) {
1212 		error = fueword32(&m->m_owner, &owner);
1213 		if (error == -1)
1214 			error = EFAULT;
		/* Same loop, but it also stops once the mutex is unowned. */
1215 		while (error == 0 && (owner & ~UMUTEX_CONTESTED) != 0 &&
1216 		       (owner & UMUTEX_CONTESTED) == 0) {
1217 			error = casueword32(&m->m_owner, owner, &old,
1218 			    owner | UMUTEX_CONTESTED);
1219 			if (error == -1) {
1220 				error = EFAULT;
1221 				break;
1222 			}
1223 			if (old == owner)
1224 				break;
1225 			owner = old;
1226 			error = umtxq_check_susp(td);
1227 			if (error != 0)
1228 				break;
1229 		}
1230 	}
1231 	umtxq_lock(&key);
1232 	if (error == EFAULT) {
		/* The mutex memory is unusable: release every waiter. */
1233 		umtxq_signal(&key, INT_MAX);
1234 	} else if (count != 0 && (owner & ~UMUTEX_CONTESTED) == 0)
1235 		umtxq_signal(&key, 1);
1236 	umtxq_unbusy(&key);
1237 	umtxq_unlock(&key);
1238 	umtx_key_release(&key);
1239 	return (error);
1240 }
1241 
1242 static inline struct umtx_pi *
1243 umtx_pi_alloc(int flags)
1244 {
1245 	struct umtx_pi *pi;
1246 
1247 	pi = uma_zalloc(umtx_pi_zone, M_ZERO | flags);
1248 	TAILQ_INIT(&pi->pi_blocked);
1249 	atomic_add_int(&umtx_pi_allocated, 1);
1250 	return (pi);
1251 }
1252 
1253 static inline void
1254 umtx_pi_free(struct umtx_pi *pi)
1255 {
1256 	uma_zfree(umtx_pi_zone, pi);
1257 	atomic_add_int(&umtx_pi_allocated, -1);
1258 }
1259 
1260 /*
1261  * Adjust the thread's position on a pi_state after its priority has been
1262  * changed.
1263  */
1264 static int
1265 umtx_pi_adjust_thread(struct umtx_pi *pi, struct thread *td)
1266 {
1267 	struct umtx_q *uq, *uq1, *uq2;
1268 	struct thread *td1;
1269 
1270 	mtx_assert(&umtx_lock, MA_OWNED);
1271 	if (pi == NULL)
1272 		return (0);
1273 
1274 	uq = td->td_umtxq;
1275 
1276 	/*
1277 	 * Check if the thread needs to be moved on the blocked chain.
1278 	 * It needs to be moved if either its priority is lower than
1279 	 * the previous thread or higher than the next thread.
1280 	 */
1281 	uq1 = TAILQ_PREV(uq, umtxq_head, uq_lockq);
1282 	uq2 = TAILQ_NEXT(uq, uq_lockq);
1283 	if ((uq1 != NULL && UPRI(td) < UPRI(uq1->uq_thread)) ||
1284 	    (uq2 != NULL && UPRI(td) > UPRI(uq2->uq_thread))) {
1285 		/*
1286 		 * Remove thread from blocked chain and determine where
1287 		 * it should be moved to.
1288 		 */
1289 		TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
1290 		TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
1291 			td1 = uq1->uq_thread;
1292 			MPASS(td1->td_proc->p_magic == P_MAGIC);
1293 			if (UPRI(td1) > UPRI(td))
1294 				break;
1295 		}
1296 
1297 		if (uq1 == NULL)
1298 			TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
1299 		else
1300 			TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
1301 	}
1302 	return (1);
1303 }
1304 
1305 static struct umtx_pi *
1306 umtx_pi_next(struct umtx_pi *pi)
1307 {
1308 	struct umtx_q *uq_owner;
1309 
1310 	if (pi->pi_owner == NULL)
1311 		return (NULL);
1312 	uq_owner = pi->pi_owner->td_umtxq;
1313 	if (uq_owner == NULL)
1314 		return (NULL);
1315 	return (uq_owner->uq_pi_blocked);
1316 }
1317 
1318 /*
1319  * Floyd's Cycle-Finding Algorithm.
1320  */
1321 static bool
1322 umtx_pi_check_loop(struct umtx_pi *pi)
1323 {
1324 	struct umtx_pi *pi1;	/* fast iterator */
1325 
1326 	mtx_assert(&umtx_lock, MA_OWNED);
1327 	if (pi == NULL)
1328 		return (false);
1329 	pi1 = pi;
1330 	for (;;) {
1331 		pi = umtx_pi_next(pi);
1332 		if (pi == NULL)
1333 			break;
1334 		pi1 = umtx_pi_next(pi1);
1335 		if (pi1 == NULL)
1336 			break;
1337 		pi1 = umtx_pi_next(pi1);
1338 		if (pi1 == NULL)
1339 			break;
1340 		if (pi == pi1)
1341 			return (true);
1342 	}
1343 	return (false);
1344 }
1345 
1346 /*
1347  * Propagate priority when a thread is blocked on POSIX
1348  * PI mutex.
1349  */
1350 static void
1351 umtx_propagate_priority(struct thread *td)
1352 {
1353 	struct umtx_q *uq;
1354 	struct umtx_pi *pi;
1355 	int pri;
1356 
1357 	mtx_assert(&umtx_lock, MA_OWNED);
1358 	pri = UPRI(td);
1359 	uq = td->td_umtxq;
1360 	pi = uq->uq_pi_blocked;
1361 	if (pi == NULL)
1362 		return;
1363 	if (umtx_pi_check_loop(pi))
1364 		return;
1365 
1366 	for (;;) {
1367 		td = pi->pi_owner;
1368 		if (td == NULL || td == curthread)
1369 			return;
1370 
1371 		MPASS(td->td_proc != NULL);
1372 		MPASS(td->td_proc->p_magic == P_MAGIC);
1373 
1374 		thread_lock(td);
1375 		if (td->td_lend_user_pri > pri)
1376 			sched_lend_user_prio(td, pri);
1377 		else {
1378 			thread_unlock(td);
1379 			break;
1380 		}
1381 		thread_unlock(td);
1382 
1383 		/*
1384 		 * Pick up the lock that td is blocked on.
1385 		 */
1386 		uq = td->td_umtxq;
1387 		pi = uq->uq_pi_blocked;
1388 		if (pi == NULL)
1389 			break;
1390 		/* Resort td on the list if needed. */
1391 		umtx_pi_adjust_thread(pi, td);
1392 	}
1393 }
1394 
1395 /*
1396  * Unpropagate priority for a PI mutex when a thread blocked on
1397  * it is interrupted by signal or resumed by others.
1398  */
1399 static void
1400 umtx_repropagate_priority(struct umtx_pi *pi)
1401 {
1402 	struct umtx_q *uq, *uq_owner;
1403 	struct umtx_pi *pi2;
1404 	int pri;
1405 
1406 	mtx_assert(&umtx_lock, MA_OWNED);
1407 
1408 	if (umtx_pi_check_loop(pi))
1409 		return;
1410 	while (pi != NULL && pi->pi_owner != NULL) {
1411 		pri = PRI_MAX;
1412 		uq_owner = pi->pi_owner->td_umtxq;
1413 
1414 		TAILQ_FOREACH(pi2, &uq_owner->uq_pi_contested, pi_link) {
1415 			uq = TAILQ_FIRST(&pi2->pi_blocked);
1416 			if (uq != NULL) {
1417 				if (pri > UPRI(uq->uq_thread))
1418 					pri = UPRI(uq->uq_thread);
1419 			}
1420 		}
1421 
1422 		if (pri > uq_owner->uq_inherited_pri)
1423 			pri = uq_owner->uq_inherited_pri;
1424 		thread_lock(pi->pi_owner);
1425 		sched_lend_user_prio(pi->pi_owner, pri);
1426 		thread_unlock(pi->pi_owner);
1427 		if ((pi = uq_owner->uq_pi_blocked) != NULL)
1428 			umtx_pi_adjust_thread(pi, uq_owner->uq_thread);
1429 	}
1430 }
1431 
1432 /*
1433  * Insert a PI mutex into owned list.
1434  */
1435 static void
1436 umtx_pi_setowner(struct umtx_pi *pi, struct thread *owner)
1437 {
1438 	struct umtx_q *uq_owner;
1439 
1440 	uq_owner = owner->td_umtxq;
1441 	mtx_assert(&umtx_lock, MA_OWNED);
1442 	if (pi->pi_owner != NULL)
1443 		panic("pi_owner != NULL");
1444 	pi->pi_owner = owner;
1445 	TAILQ_INSERT_TAIL(&uq_owner->uq_pi_contested, pi, pi_link);
1446 }
1447 
1448 
1449 /*
1450  * Disown a PI mutex, and remove it from the owned list.
1451  */
1452 static void
1453 umtx_pi_disown(struct umtx_pi *pi)
1454 {
1455 
1456 	mtx_assert(&umtx_lock, MA_OWNED);
1457 	TAILQ_REMOVE(&pi->pi_owner->td_umtxq->uq_pi_contested, pi, pi_link);
1458 	pi->pi_owner = NULL;
1459 }
1460 
1461 /*
1462  * Claim ownership of a PI mutex.
1463  */
1464 static int
1465 umtx_pi_claim(struct umtx_pi *pi, struct thread *owner)
1466 {
1467 	struct umtx_q *uq;
1468 
1469 	mtx_lock(&umtx_lock);
1470 	if (pi->pi_owner == owner) {
1471 		mtx_unlock(&umtx_lock);
1472 		return (0);
1473 	}
1474 
1475 	if (pi->pi_owner != NULL) {
1476 		/*
1477 		 * userland may have already messed the mutex, sigh.
1478 		 */
1479 		mtx_unlock(&umtx_lock);
1480 		return (EPERM);
1481 	}
1482 	umtx_pi_setowner(pi, owner);
1483 	uq = TAILQ_FIRST(&pi->pi_blocked);
1484 	if (uq != NULL) {
1485 		int pri;
1486 
1487 		pri = UPRI(uq->uq_thread);
1488 		thread_lock(owner);
1489 		if (pri < UPRI(owner))
1490 			sched_lend_user_prio(owner, pri);
1491 		thread_unlock(owner);
1492 	}
1493 	mtx_unlock(&umtx_lock);
1494 	return (0);
1495 }
1496 
1497 /*
1498  * Adjust a thread's order position in its blocked PI mutex,
1499  * this may result new priority propagating process.
1500  */
1501 void
1502 umtx_pi_adjust(struct thread *td, u_char oldpri)
1503 {
1504 	struct umtx_q *uq;
1505 	struct umtx_pi *pi;
1506 
1507 	uq = td->td_umtxq;
1508 	mtx_lock(&umtx_lock);
1509 	/*
1510 	 * Pick up the lock that td is blocked on.
1511 	 */
1512 	pi = uq->uq_pi_blocked;
1513 	if (pi != NULL) {
1514 		umtx_pi_adjust_thread(pi, td);
1515 		umtx_repropagate_priority(pi);
1516 	}
1517 	mtx_unlock(&umtx_lock);
1518 }
1519 
1520 /*
1521  * Sleep on a PI mutex.
 *
 * Must be called by the thread that uq belongs to, with the umtx chain
 * for uq->uq_key locked and busy (both are asserted).  The thread is
 * queued on the chain and on pi's blocked list, which is kept ordered by
 * UPRI() value, flagged TDF_UPIBLOCKED, and its priority is propagated
 * to the owner before the busy state is dropped and the thread sleeps.
 * If pi has no owner yet, 'owner' is looked up as a thread id in the
 * current process and, if found, recorded as the owner.
 *
 * After the sleep the thread is dequeued again, priorities are
 * recomputed and the chain lock is released.  Returns the result of
 * umtxq_sleep().
1522  */
1523 static int
1524 umtxq_sleep_pi(struct umtx_q *uq, struct umtx_pi *pi,
1525 	uint32_t owner, const char *wmesg, struct abs_timeout *timo)
1526 {
1527 	struct umtxq_chain *uc;
1528 	struct thread *td, *td1;
1529 	struct umtx_q *uq1;
1530 	int pri;
1531 	int error = 0;
1532 
1533 	td = uq->uq_thread;
1534 	KASSERT(td == curthread, ("inconsistent uq_thread"));
1535 	uc = umtxq_getchain(&uq->uq_key);
1536 	UMTXQ_LOCKED_ASSERT(uc);
1537 	KASSERT(uc->uc_busy != 0, ("umtx chain is not busy"));
1538 	umtxq_insert(uq);
1539 	mtx_lock(&umtx_lock);
1540 	if (pi->pi_owner == NULL) {
		/*
		 * umtx_lock is dropped around the thread lookup, so the
		 * owner has to be re-checked once it is held again.
		 */
1541 		mtx_unlock(&umtx_lock);
1542 		/* XXX Only look up thread in current process. */
1543 		td1 = tdfind(owner, curproc->p_pid);
1544 		mtx_lock(&umtx_lock);
1545 		if (td1 != NULL) {
1546 			if (pi->pi_owner == NULL)
1547 				umtx_pi_setowner(pi, td1);
			/* NOTE(review): tdfind() presumably returns with the process locked. */
1548 			PROC_UNLOCK(td1->td_proc);
1549 		}
1550 	}
1551 
	/* Find the first blocked thread with a greater UPRI() value. */
1552 	TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
1553 		pri = UPRI(uq1->uq_thread);
1554 		if (pri > UPRI(td))
1555 			break;
1556 	}
1557 
1558 	if (uq1 != NULL)
1559 		TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
1560 	else
1561 		TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
1562 
1563 	uq->uq_pi_blocked = pi;
1564 	thread_lock(td);
1565 	td->td_flags |= TDF_UPIBLOCKED;
1566 	thread_unlock(td);
1567 	umtx_propagate_priority(td);
1568 	mtx_unlock(&umtx_lock);
1569 	umtxq_unbusy(&uq->uq_key);
1570 
1571 	error = umtxq_sleep(uq, wmesg, timo);
1572 	umtxq_remove(uq);
1573 
	/* Undo the blocked state and let the owner chain drop our priority. */
1574 	mtx_lock(&umtx_lock);
1575 	uq->uq_pi_blocked = NULL;
1576 	thread_lock(td);
1577 	td->td_flags &= ~TDF_UPIBLOCKED;
1578 	thread_unlock(td);
1579 	TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
1580 	umtx_repropagate_priority(pi);
1581 	mtx_unlock(&umtx_lock);
1582 	umtxq_unlock(&uq->uq_key);
1583 
1584 	return (error);
1585 }
1586 
1587 /*
1588  * Add reference count for a PI mutex.
1589  */
1590 static void
1591 umtx_pi_ref(struct umtx_pi *pi)
1592 {
1593 	struct umtxq_chain *uc;
1594 
1595 	uc = umtxq_getchain(&pi->pi_key);
1596 	UMTXQ_LOCKED_ASSERT(uc);
1597 	pi->pi_refcount++;
1598 }
1599 
1600 /*
1601  * Decrease reference count for a PI mutex, if the counter
1602  * is decreased to zero, its memory space is freed.
1603  */
1604 static void
1605 umtx_pi_unref(struct umtx_pi *pi)
1606 {
1607 	struct umtxq_chain *uc;
1608 
1609 	uc = umtxq_getchain(&pi->pi_key);
1610 	UMTXQ_LOCKED_ASSERT(uc);
1611 	KASSERT(pi->pi_refcount > 0, ("invalid reference count"));
1612 	if (--pi->pi_refcount == 0) {
1613 		mtx_lock(&umtx_lock);
1614 		if (pi->pi_owner != NULL)
1615 			umtx_pi_disown(pi);
1616 		KASSERT(TAILQ_EMPTY(&pi->pi_blocked),
1617 			("blocked queue not empty"));
1618 		mtx_unlock(&umtx_lock);
1619 		TAILQ_REMOVE(&uc->uc_pi_list, pi, pi_hashlink);
1620 		umtx_pi_free(pi);
1621 	}
1622 }
1623 
1624 /*
1625  * Find a PI mutex in hash table.
1626  */
1627 static struct umtx_pi *
1628 umtx_pi_lookup(struct umtx_key *key)
1629 {
1630 	struct umtxq_chain *uc;
1631 	struct umtx_pi *pi;
1632 
1633 	uc = umtxq_getchain(key);
1634 	UMTXQ_LOCKED_ASSERT(uc);
1635 
1636 	TAILQ_FOREACH(pi, &uc->uc_pi_list, pi_hashlink) {
1637 		if (umtx_key_match(&pi->pi_key, key)) {
1638 			return (pi);
1639 		}
1640 	}
1641 	return (NULL);
1642 }
1643 
1644 /*
1645  * Insert a PI mutex into hash table.
1646  */
1647 static inline void
1648 umtx_pi_insert(struct umtx_pi *pi)
1649 {
1650 	struct umtxq_chain *uc;
1651 
1652 	uc = umtxq_getchain(&pi->pi_key);
1653 	UMTXQ_LOCKED_ASSERT(uc);
1654 	TAILQ_INSERT_TAIL(&uc->uc_pi_list, pi, pi_hashlink);
1655 }
1656 
1657 /*
1658  * Lock a PI mutex.
 *
 * The umtx_pi for the mutex is looked up, or allocated and inserted, and
 * a reference is held on it for the duration of the call.  The owner
 * word in userland is then driven with compare-and-set operations:
 * an unowned mutex is taken directly, an unowned-but-contested one is
 * taken and claimed through umtx_pi_claim(), and an owned one gets its
 * UMUTEX_CONTESTED bit set before the thread sleeps in umtxq_sleep_pi().
 *
 * A non-zero 'try' makes the call return EBUSY instead of sleeping;
 * 'timeout', when not NULL, bounds the sleep.
 *
 * Returns 0 when the mutex was acquired, EFAULT if the mutex memory
 * cannot be accessed, EDEADLK if the caller already owns the mutex,
 * EBUSY for a failed try, or an error from umtx_key_get(),
 * umtx_pi_claim(), umtxq_check_susp() or the sleep.
1659  */
1660 static int
1661 do_lock_pi(struct thread *td, struct umutex *m, uint32_t flags,
1662     struct _umtx_time *timeout, int try)
1663 {
1664 	struct abs_timeout timo;
1665 	struct umtx_q *uq;
1666 	struct umtx_pi *pi, *new_pi;
1667 	uint32_t id, owner, old;
1668 	int error, rv;
1669 
1670 	id = td->td_tid;
1671 	uq = td->td_umtxq;
1672 
1673 	if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
1674 	    &uq->uq_key)) != 0)
1675 		return (error);
1676 
1677 	if (timeout != NULL)
1678 		abs_timeout_init2(&timo, timeout);
1679 
	/*
	 * Try a non-sleeping allocation first while the chain lock is held.
	 * If that yields NULL, drop the lock, allocate with M_WAITOK and
	 * look again in case another thread inserted one meanwhile.
	 */
1680 	umtxq_lock(&uq->uq_key);
1681 	pi = umtx_pi_lookup(&uq->uq_key);
1682 	if (pi == NULL) {
1683 		new_pi = umtx_pi_alloc(M_NOWAIT);
1684 		if (new_pi == NULL) {
1685 			umtxq_unlock(&uq->uq_key);
1686 			new_pi = umtx_pi_alloc(M_WAITOK);
1687 			umtxq_lock(&uq->uq_key);
1688 			pi = umtx_pi_lookup(&uq->uq_key);
1689 			if (pi != NULL) {
1690 				umtx_pi_free(new_pi);
1691 				new_pi = NULL;
1692 			}
1693 		}
1694 		if (new_pi != NULL) {
1695 			new_pi->pi_key = uq->uq_key;
1696 			umtx_pi_insert(new_pi);
1697 			pi = new_pi;
1698 		}
1699 	}
1700 	umtx_pi_ref(pi);
1701 	umtxq_unlock(&uq->uq_key);
1702 
1703 	/*
1704 	 * Care must be exercised when dealing with umtx structure.  It
1705 	 * can fault on any access.
1706 	 */
1707 	for (;;) {
1708 		/*
1709 		 * Try the uncontested case.  This should be done in userland.
1710 		 */
1711 		rv = casueword32(&m->m_owner, UMUTEX_UNOWNED, &owner, id);
1712 		/* The address was invalid. */
1713 		if (rv == -1) {
1714 			error = EFAULT;
1715 			break;
1716 		}
1717 
1718 		/* The acquire succeeded. */
1719 		if (owner == UMUTEX_UNOWNED) {
1720 			error = 0;
1721 			break;
1722 		}
1723 
1724 		/* If no one owns it but it is contested try to acquire it. */
1725 		if (owner == UMUTEX_CONTESTED) {
1726 			rv = casueword32(&m->m_owner,
1727 			    UMUTEX_CONTESTED, &owner, id | UMUTEX_CONTESTED);
1728 			/* The address was invalid. */
1729 			if (rv == -1) {
1730 				error = EFAULT;
1731 				break;
1732 			}
1733 
1734 			if (owner == UMUTEX_CONTESTED) {
				/* We own the word now; record it in the umtx_pi. */
1735 				umtxq_lock(&uq->uq_key);
1736 				umtxq_busy(&uq->uq_key);
1737 				error = umtx_pi_claim(pi, td);
1738 				umtxq_unbusy(&uq->uq_key);
1739 				umtxq_unlock(&uq->uq_key);
1740 				if (error != 0) {
1741 					/*
1742 					 * Since we're going to return an
1743 					 * error, restore the m_owner to its
1744 					 * previous, unowned state to avoid
1745 					 * compounding the problem.
1746 					 */
					/*
					 * NOTE(review): this uses casuword32()
					 * where the rest of the function uses
					 * casueword32(); the result is ignored.
					 */
1747 					(void)casuword32(&m->m_owner,
1748 					    id | UMUTEX_CONTESTED,
1749 					    UMUTEX_CONTESTED);
1750 				}
1751 				break;
1752 			}
1753 
1754 			error = umtxq_check_susp(td);
1755 			if (error != 0)
1756 				break;
1757 
1758 			/* If this failed the lock has changed, restart. */
1759 			continue;
1760 		}
1761 
1762 		if ((owner & ~UMUTEX_CONTESTED) == id) {
1763 			error = EDEADLK;
1764 			break;
1765 		}
1766 
1767 		if (try != 0) {
1768 			error = EBUSY;
1769 			break;
1770 		}
1771 
1772 		/*
1773 		 * If we caught a signal, we have retried and now
1774 		 * exit immediately.
1775 		 */
1776 		if (error != 0)
1777 			break;
1778 
		/* Keep the chain busy while the contested bit is being set. */
1779 		umtxq_lock(&uq->uq_key);
1780 		umtxq_busy(&uq->uq_key);
1781 		umtxq_unlock(&uq->uq_key);
1782 
1783 		/*
1784 		 * Set the contested bit so that a release in user space
1785 		 * knows to use the system call for unlock.  If this fails
1786 		 * either some one else has acquired the lock or it has been
1787 		 * released.
1788 		 */
1789 		rv = casueword32(&m->m_owner, owner, &old,
1790 		    owner | UMUTEX_CONTESTED);
1791 
1792 		/* The address was invalid. */
1793 		if (rv == -1) {
1794 			umtxq_unbusy_unlocked(&uq->uq_key);
1795 			error = EFAULT;
1796 			break;
1797 		}
1798 
1799 		umtxq_lock(&uq->uq_key);
1800 		/*
1801 		 * We set the contested bit, sleep. Otherwise the lock changed
1802 		 * and we need to retry or we lost a race to the thread
1803 		 * unlocking the umtx.
1804 		 */
1805 		if (old == owner) {
			/*
			 * umtxq_sleep_pi() drops the busy state and the chain
			 * lock itself.  On a sleep error go round once more;
			 * the non-zero error ends the loop further up if the
			 * mutex still cannot be taken.
			 */
1806 			error = umtxq_sleep_pi(uq, pi, owner & ~UMUTEX_CONTESTED,
1807 			    "umtxpi", timeout == NULL ? NULL : &timo);
1808 			if (error != 0)
1809 				continue;
1810 		} else {
1811 			umtxq_unbusy(&uq->uq_key);
1812 			umtxq_unlock(&uq->uq_key);
1813 		}
1814 
1815 		error = umtxq_check_susp(td);
1816 		if (error != 0)
1817 			break;
1818 	}
1819 
	/* Drop the reference taken on the umtx_pi at the top. */
1820 	umtxq_lock(&uq->uq_key);
1821 	umtx_pi_unref(pi);
1822 	umtxq_unlock(&uq->uq_key);
1823 
1824 	umtx_key_release(&uq->uq_key);
1825 	return (error);
1826 }
1827 
1828 /*
1829  * Unlock a PI mutex.
 *
 * Fails with EPERM unless the calling thread's id is in the owner word.
 * A mutex without the contested bit is released with one compare-and-set;
 * if that loses a race the contested path is taken.  There, with the
 * chain busy, ownership of the umtx_pi is dropped, the caller's lent
 * priority is recomputed from the PI mutexes it still has contested, and
 * the first blocked thread that is still on the sleep queue is woken.
 * The owner word is finally set to UMUTEX_UNOWNED when at most one
 * waiter was counted, otherwise to UMUTEX_CONTESTED.
 *
 * Returns 0, EFAULT, EPERM, EINVAL if the owner word changed under the
 * final compare-and-set, or the error from umtx_key_get().
1830  */
1831 static int
1832 do_unlock_pi(struct thread *td, struct umutex *m, uint32_t flags)
1833 {
1834 	struct umtx_key key;
1835 	struct umtx_q *uq_first, *uq_first2, *uq_me;
1836 	struct umtx_pi *pi, *pi2;
1837 	uint32_t owner, old, id;
1838 	int error;
1839 	int count;
1840 	int pri;
1841 
1842 	id = td->td_tid;
1843 	/*
1844 	 * Make sure we own this mtx.
1845 	 */
1846 	error = fueword32(&m->m_owner, &owner);
1847 	if (error == -1)
1848 		return (EFAULT);
1849 
1850 	if ((owner & ~UMUTEX_CONTESTED) != id)
1851 		return (EPERM);
1852 
1853 	/* This should be done in userland */
1854 	if ((owner & UMUTEX_CONTESTED) == 0) {
1855 		error = casueword32(&m->m_owner, owner, &old, UMUTEX_UNOWNED);
1856 		if (error == -1)
1857 			return (EFAULT);
1858 		if (old == owner)
1859 			return (0);
		/* Lost a race; carry on with the value just observed. */
1860 		owner = old;
1861 	}
1862 
1863 	/* We should only ever be in here for contested locks */
1864 	if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
1865 	    &key)) != 0)
1866 		return (error);
1867 
1868 	umtxq_lock(&key);
1869 	umtxq_busy(&key);
1870 	count = umtxq_count_pi(&key, &uq_first);
1871 	if (uq_first != NULL) {
1872 		mtx_lock(&umtx_lock);
1873 		pi = uq_first->uq_pi_blocked;
1874 		KASSERT(pi != NULL, ("pi == NULL?"));
1875 		if (pi->pi_owner != td) {
1876 			mtx_unlock(&umtx_lock);
1877 			umtxq_unbusy(&key);
1878 			umtxq_unlock(&key);
1879 			umtx_key_release(&key);
1880 			/* userland messed the mutex */
1881 			return (EPERM);
1882 		}
1883 		uq_me = td->td_umtxq;
1884 		umtx_pi_disown(pi);
1885 		/* get highest priority thread which is still sleeping. */
1886 		uq_first = TAILQ_FIRST(&pi->pi_blocked);
1887 		while (uq_first != NULL &&
1888 		       (uq_first->uq_flags & UQF_UMTXQ) == 0) {
1889 			uq_first = TAILQ_NEXT(uq_first, uq_lockq);
1890 		}
		/*
		 * Recompute our lent priority from the first blocked thread
		 * of every PI mutex we still have contested.
		 */
1891 		pri = PRI_MAX;
1892 		TAILQ_FOREACH(pi2, &uq_me->uq_pi_contested, pi_link) {
1893 			uq_first2 = TAILQ_FIRST(&pi2->pi_blocked);
1894 			if (uq_first2 != NULL) {
1895 				if (pri > UPRI(uq_first2->uq_thread))
1896 					pri = UPRI(uq_first2->uq_thread);
1897 			}
1898 		}
1899 		thread_lock(td);
1900 		sched_lend_user_prio(td, pri);
1901 		thread_unlock(td);
1902 		mtx_unlock(&umtx_lock);
1903 		if (uq_first)
1904 			umtxq_signal_thread(uq_first);
1905 	} else {
1906 		pi = umtx_pi_lookup(&key);
1907 		/*
1908 		 * A umtx_pi can exist if a signal or timeout removed the
1909 		 * last waiter from the umtxq, but there is still
1910 		 * a thread in do_lock_pi() holding the umtx_pi.
1911 		 */
1912 		if (pi != NULL) {
1913 			/*
1914 			 * The umtx_pi can be unowned, such as when a thread
1915 			 * has just entered do_lock_pi(), allocated the
1916 			 * umtx_pi, and unlocked the umtxq.
1917 			 * If the current thread owns it, it must disown it.
1918 			 */
1919 			mtx_lock(&umtx_lock);
1920 			if (pi->pi_owner == td)
1921 				umtx_pi_disown(pi);
1922 			mtx_unlock(&umtx_lock);
1923 		}
1924 	}
	/* The chain stays busy, but unlocked, across the userland access. */
1925 	umtxq_unlock(&key);
1926 
1927 	/*
1928 	 * When unlocking the umtx, it must be marked as unowned if
1929 	 * there is zero or one thread only waiting for it.
1930 	 * Otherwise, it must be marked as contested.
1931 	 */
1932 	error = casueword32(&m->m_owner, owner, &old,
1933 	    count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
1934 
1935 	umtxq_unbusy_unlocked(&key);
1936 	umtx_key_release(&key);
1937 	if (error == -1)
1938 		return (EFAULT);
1939 	if (old != owner)
1940 		return (EINVAL);
1941 	return (0);
1942 }
1943 
1944 /*
1945  * Lock a PP mutex.
 *
 * Each attempt marks the chain busy, reads the ceiling from
 * m_ceilings[0] and converts it to the kernel priority
 * PRI_MIN_REALTIME + (RTP_PRIO_MAX - ceiling).  EINVAL is returned if
 * the ceiling is out of range or the caller's UPRI() value is
 * numerically below that priority.  If the caller passes the
 * PRIV_SCHED_RTPRIO check, its inherited priority is moved to the
 * ceiling priority before the owner word is switched from
 * UMUTEX_CONTESTED to id | UMUTEX_CONTESTED.  If the mutex is held the
 * thread sleeps; after the sleep its inherited priority is restored and
 * the attempt is repeated.
 *
 * A non-zero 'try' makes the call return EBUSY instead of sleeping;
 * 'timeout', when not NULL, bounds the sleep.  On any error the
 * inherited priority is restored to its value before the last attempt.
 *
 * Returns 0 when the mutex was acquired, EFAULT, EINVAL, EBUSY, the
 * sleep error, or the error from umtx_key_get().
1946  */
1947 static int
1948 do_lock_pp(struct thread *td, struct umutex *m, uint32_t flags,
1949     struct _umtx_time *timeout, int try)
1950 {
1951 	struct abs_timeout timo;
1952 	struct umtx_q *uq, *uq2;
1953 	struct umtx_pi *pi;
1954 	uint32_t ceiling;
1955 	uint32_t owner, id;
1956 	int error, pri, old_inherited_pri, su, rv;
1957 
1958 	id = td->td_tid;
1959 	uq = td->td_umtxq;
1960 	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
1961 	    &uq->uq_key)) != 0)
1962 		return (error);
1963 
1964 	if (timeout != NULL)
1965 		abs_timeout_init2(&timo, timeout);
1966 
1967 	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
1968 	for (;;) {
1969 		old_inherited_pri = uq->uq_inherited_pri;
1970 		umtxq_lock(&uq->uq_key);
1971 		umtxq_busy(&uq->uq_key);
1972 		umtxq_unlock(&uq->uq_key);
1973 
1974 		rv = fueword32(&m->m_ceilings[0], &ceiling);
1975 		if (rv == -1) {
1976 			error = EFAULT;
1977 			goto out;
1978 		}
		/*
		 * 'ceiling' is unsigned, so a userland value above
		 * RTP_PRIO_MAX wraps around and is caught by the range check.
		 */
1979 		ceiling = RTP_PRIO_MAX - ceiling;
1980 		if (ceiling > RTP_PRIO_MAX) {
1981 			error = EINVAL;
1982 			goto out;
1983 		}
1984 
1985 		mtx_lock(&umtx_lock);
1986 		if (UPRI(td) < PRI_MIN_REALTIME + ceiling) {
1987 			mtx_unlock(&umtx_lock);
1988 			error = EINVAL;
1989 			goto out;
1990 		}
1991 		if (su && PRI_MIN_REALTIME + ceiling < uq->uq_inherited_pri) {
1992 			uq->uq_inherited_pri = PRI_MIN_REALTIME + ceiling;
1993 			thread_lock(td);
1994 			if (uq->uq_inherited_pri < UPRI(td))
1995 				sched_lend_user_prio(td, uq->uq_inherited_pri);
1996 			thread_unlock(td);
1997 		}
1998 		mtx_unlock(&umtx_lock);
1999 
2000 		rv = casueword32(&m->m_owner,
2001 		    UMUTEX_CONTESTED, &owner, id | UMUTEX_CONTESTED);
2002 		/* The address was invalid. */
2003 		if (rv == -1) {
2004 			error = EFAULT;
2005 			break;
2006 		}
2007 
		/* The word was unowned, so the compare-and-set took the mutex. */
2008 		if (owner == UMUTEX_CONTESTED) {
2009 			error = 0;
2010 			break;
2011 		}
2012 
2013 		if (try != 0) {
2014 			error = EBUSY;
2015 			break;
2016 		}
2017 
2018 		/*
2019 		 * If we caught a signal, we have retried and now
2020 		 * exit immediately.
2021 		 */
2022 		if (error != 0)
2023 			break;
2024 
2025 		umtxq_lock(&uq->uq_key);
2026 		umtxq_insert(uq);
2027 		umtxq_unbusy(&uq->uq_key);
2028 		error = umtxq_sleep(uq, "umtxpp", timeout == NULL ?
2029 		    NULL : &timo);
2030 		umtxq_remove(uq);
2031 		umtxq_unlock(&uq->uq_key);
2032 
		/*
		 * Undo the ceiling boost before retrying: the lent priority
		 * becomes the smallest UPRI() among the first waiters of our
		 * contested PI mutexes, capped by the restored inherited
		 * priority.
		 */
2033 		mtx_lock(&umtx_lock);
2034 		uq->uq_inherited_pri = old_inherited_pri;
2035 		pri = PRI_MAX;
2036 		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
2037 			uq2 = TAILQ_FIRST(&pi->pi_blocked);
2038 			if (uq2 != NULL) {
2039 				if (pri > UPRI(uq2->uq_thread))
2040 					pri = UPRI(uq2->uq_thread);
2041 			}
2042 		}
2043 		if (pri > uq->uq_inherited_pri)
2044 			pri = uq->uq_inherited_pri;
2045 		thread_lock(td);
2046 		sched_lend_user_prio(td, pri);
2047 		thread_unlock(td);
2048 		mtx_unlock(&umtx_lock);
2049 	}
2050 
	/* Failed after the boost: restore the priority the same way. */
2051 	if (error != 0) {
2052 		mtx_lock(&umtx_lock);
2053 		uq->uq_inherited_pri = old_inherited_pri;
2054 		pri = PRI_MAX;
2055 		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
2056 			uq2 = TAILQ_FIRST(&pi->pi_blocked);
2057 			if (uq2 != NULL) {
2058 				if (pri > UPRI(uq2->uq_thread))
2059 					pri = UPRI(uq2->uq_thread);
2060 			}
2061 		}
2062 		if (pri > uq->uq_inherited_pri)
2063 			pri = uq->uq_inherited_pri;
2064 		thread_lock(td);
2065 		sched_lend_user_prio(td, pri);
2066 		thread_unlock(td);
2067 		mtx_unlock(&umtx_lock);
2068 	}
2069 
2070 out:
	/* Every path reaching here still has the chain marked busy. */
2071 	umtxq_unbusy_unlocked(&uq->uq_key);
2072 	umtx_key_release(&uq->uq_key);
2073 	return (error);
2074 }
2075 
2076 /*
2077  * Unlock a PP mutex.
 *
 * Fails with EPERM unless the calling thread's id is in the owner word.
 * The value read from m_ceilings[1] selects the inherited priority to
 * fall back to: an all-ones value means PRI_MAX, anything else is
 * converted like a ceiling and rejected with EINVAL when out of range.
 * The owner word is then stored as UMUTEX_CONTESTED and one waiter is
 * woken.  On success the caller's inherited priority is updated (only if
 * it passes the PRIV_SCHED_RTPRIO check) and its lent priority is
 * recomputed.
 *
 * Returns 0, EFAULT, EPERM, EINVAL, or an error from copyin() or
 * umtx_key_get().
2078  */
2079 static int
2080 do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags)
2081 {
2082 	struct umtx_key key;
2083 	struct umtx_q *uq, *uq2;
2084 	struct umtx_pi *pi;
2085 	uint32_t owner, id;
2086 	uint32_t rceiling;
2087 	int error, pri, new_inherited_pri, su;
2088 
2089 	id = td->td_tid;
2090 	uq = td->td_umtxq;
2091 	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
2092 
2093 	/*
2094 	 * Make sure we own this mtx.
2095 	 */
2096 	error = fueword32(&m->m_owner, &owner);
2097 	if (error == -1)
2098 		return (EFAULT);
2099 
2100 	if ((owner & ~UMUTEX_CONTESTED) != id)
2101 		return (EPERM);
2102 
2103 	error = copyin(&m->m_ceilings[1], &rceiling, sizeof(uint32_t));
2104 	if (error != 0)
2105 		return (error);
2106 
	/* rceiling is unsigned, so this matches the all-ones value. */
2107 	if (rceiling == -1)
2108 		new_inherited_pri = PRI_MAX;
2109 	else {
2110 		rceiling = RTP_PRIO_MAX - rceiling;
2111 		if (rceiling > RTP_PRIO_MAX)
2112 			return (EINVAL);
2113 		new_inherited_pri = PRI_MIN_REALTIME + rceiling;
2114 	}
2115 
2116 	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
2117 	    &key)) != 0)
2118 		return (error);
2119 	umtxq_lock(&key);
2120 	umtxq_busy(&key);
2121 	umtxq_unlock(&key);
2122 	/*
2123 	 * For priority protected mutex, always set unlocked state
2124 	 * to UMUTEX_CONTESTED, so that userland always enters kernel
2125 	 * to lock the mutex, it is necessary because thread priority
2126 	 * has to be adjusted for such mutex.
2127 	 */
2128 	error = suword32(&m->m_owner, UMUTEX_CONTESTED);
2129 
2130 	umtxq_lock(&key);
2131 	if (error == 0)
2132 		umtxq_signal(&key, 1);
2133 	umtxq_unbusy(&key);
2134 	umtxq_unlock(&key);
2135 
2136 	if (error == -1)
2137 		error = EFAULT;
2138 	else {
		/*
		 * The lent priority becomes the smallest UPRI() among the
		 * first waiters of our contested PI mutexes, capped by the
		 * inherited priority.
		 */
2139 		mtx_lock(&umtx_lock);
2140 		if (su != 0)
2141 			uq->uq_inherited_pri = new_inherited_pri;
2142 		pri = PRI_MAX;
2143 		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
2144 			uq2 = TAILQ_FIRST(&pi->pi_blocked);
2145 			if (uq2 != NULL) {
2146 				if (pri > UPRI(uq2->uq_thread))
2147 					pri = UPRI(uq2->uq_thread);
2148 			}
2149 		}
2150 		if (pri > uq->uq_inherited_pri)
2151 			pri = uq->uq_inherited_pri;
2152 		thread_lock(td);
2153 		sched_lend_user_prio(td, pri);
2154 		thread_unlock(td);
2155 		mtx_unlock(&umtx_lock);
2156 	}
2157 	umtx_key_release(&key);
2158 	return (error);
2159 }
2160 
2161 static int
2162 do_set_ceiling(struct thread *td, struct umutex *m, uint32_t ceiling,
2163 	uint32_t *old_ceiling)
2164 {
2165 	struct umtx_q *uq;
2166 	uint32_t save_ceiling;
2167 	uint32_t owner, id;
2168 	uint32_t flags;
2169 	int error, rv;
2170 
2171 	error = fueword32(&m->m_flags, &flags);
2172 	if (error == -1)
2173 		return (EFAULT);
2174 	if ((flags & UMUTEX_PRIO_PROTECT) == 0)
2175 		return (EINVAL);
2176 	if (ceiling > RTP_PRIO_MAX)
2177 		return (EINVAL);
2178 	id = td->td_tid;
2179 	uq = td->td_umtxq;
2180 	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
2181 	   &uq->uq_key)) != 0)
2182 		return (error);
2183 	for (;;) {
2184 		umtxq_lock(&uq->uq_key);
2185 		umtxq_busy(&uq->uq_key);
2186 		umtxq_unlock(&uq->uq_key);
2187 
2188 		rv = fueword32(&m->m_ceilings[0], &save_ceiling);
2189 		if (rv == -1) {
2190 			error = EFAULT;
2191 			break;
2192 		}
2193 
2194 		rv = casueword32(&m->m_owner,
2195 		    UMUTEX_CONTESTED, &owner, id | UMUTEX_CONTESTED);
2196 		if (rv == -1) {
2197 			error = EFAULT;
2198 			break;
2199 		}
2200 
2201 		if (owner == UMUTEX_CONTESTED) {
2202 			suword32(&m->m_ceilings[0], ceiling);
2203 			suword32(&m->m_owner, UMUTEX_CONTESTED);
2204 			error = 0;
2205 			break;
2206 		}
2207 
2208 		if ((owner & ~UMUTEX_CONTESTED) == id) {
2209 			suword32(&m->m_ceilings[0], ceiling);
2210 			error = 0;
2211 			break;
2212 		}
2213 
2214 		/*
2215 		 * If we caught a signal, we have retried and now
2216 		 * exit immediately.
2217 		 */
2218 		if (error != 0)
2219 			break;
2220 
2221 		/*
2222 		 * We set the contested bit, sleep. Otherwise the lock changed
2223 		 * and we need to retry or we lost a race to the thread
2224 		 * unlocking the umtx.
2225 		 */
2226 		umtxq_lock(&uq->uq_key);
2227 		umtxq_insert(uq);
2228 		umtxq_unbusy(&uq->uq_key);
2229 		error = umtxq_sleep(uq, "umtxpp", NULL);
2230 		umtxq_remove(uq);
2231 		umtxq_unlock(&uq->uq_key);
2232 	}
2233 	umtxq_lock(&uq->uq_key);
2234 	if (error == 0)
2235 		umtxq_signal(&uq->uq_key, INT_MAX);
2236 	umtxq_unbusy(&uq->uq_key);
2237 	umtxq_unlock(&uq->uq_key);
2238 	umtx_key_release(&uq->uq_key);
2239 	if (error == 0 && old_ceiling != NULL)
2240 		suword32(old_ceiling, save_ceiling);
2241 	return (error);
2242 }
2243 
2244 /*
2245  * Lock a userland POSIX mutex.
2246  */
2247 static int
2248 do_lock_umutex(struct thread *td, struct umutex *m,
2249     struct _umtx_time *timeout, int mode)
2250 {
2251 	uint32_t flags;
2252 	int error;
2253 
2254 	error = fueword32(&m->m_flags, &flags);
2255 	if (error == -1)
2256 		return (EFAULT);
2257 
2258 	switch(flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
2259 	case 0:
2260 		error = do_lock_normal(td, m, flags, timeout, mode);
2261 		break;
2262 	case UMUTEX_PRIO_INHERIT:
2263 		error = do_lock_pi(td, m, flags, timeout, mode);
2264 		break;
2265 	case UMUTEX_PRIO_PROTECT:
2266 		error = do_lock_pp(td, m, flags, timeout, mode);
2267 		break;
2268 	default:
2269 		return (EINVAL);
2270 	}
2271 	if (timeout == NULL) {
2272 		if (error == EINTR && mode != _UMUTEX_WAIT)
2273 			error = ERESTART;
2274 	} else {
2275 		/* Timed-locking is not restarted. */
2276 		if (error == ERESTART)
2277 			error = EINTR;
2278 	}
2279 	return (error);
2280 }
2281 
2282 /*
2283  * Unlock a userland POSIX mutex.
2284  */
2285 static int
2286 do_unlock_umutex(struct thread *td, struct umutex *m)
2287 {
2288 	uint32_t flags;
2289 	int error;
2290 
2291 	error = fueword32(&m->m_flags, &flags);
2292 	if (error == -1)
2293 		return (EFAULT);
2294 
2295 	switch(flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
2296 	case 0:
2297 		return (do_unlock_normal(td, m, flags));
2298 	case UMUTEX_PRIO_INHERIT:
2299 		return (do_unlock_pi(td, m, flags));
2300 	case UMUTEX_PRIO_PROTECT:
2301 		return (do_unlock_pp(td, m, flags));
2302 	}
2303 
2304 	return (EINVAL);
2305 }
2306 
/*
 * Wait on a userland condition variable.
 *
 * The thread is queued on the wait queue for cv, c_has_waiters is set to
 * 1 if it reads as 0, and the user mutex m is released through
 * do_unlock_umutex() before the thread sleeps.  With CVWAIT_CLOCKID in
 * wflags the clock id is read from cv->c_clockid and validated,
 * otherwise CLOCK_REALTIME is used; CVWAIT_ABSTIME is passed on to
 * abs_timeout_init() together with 'timeout'.
 *
 * Returns 0 whenever the thread is found to be no longer queued
 * (UQF_UMTXQ clear) afterwards.  Otherwise the thread removes itself,
 * clears c_has_waiters if it was the last one queued, and returns the
 * unlock or sleep error with ERESTART mapped to EINTR.  EFAULT or EINVAL
 * is returned early for bad userland data, as is any error from
 * umtx_key_get().
 */
2307 static int
2308 do_cv_wait(struct thread *td, struct ucond *cv, struct umutex *m,
2309 	struct timespec *timeout, u_long wflags)
2310 {
2311 	struct abs_timeout timo;
2312 	struct umtx_q *uq;
2313 	uint32_t flags, clockid, hasw;
2314 	int error;
2315 
2316 	uq = td->td_umtxq;
2317 	error = fueword32(&cv->c_flags, &flags);
2318 	if (error == -1)
2319 		return (EFAULT);
2320 	error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &uq->uq_key);
2321 	if (error != 0)
2322 		return (error);
2323 
2324 	if ((wflags & CVWAIT_CLOCKID) != 0) {
2325 		error = fueword32(&cv->c_clockid, &clockid);
2326 		if (error == -1) {
2327 			umtx_key_release(&uq->uq_key);
2328 			return (EFAULT);
2329 		}
2330 		if (clockid < CLOCK_REALTIME ||
2331 		    clockid >= CLOCK_THREAD_CPUTIME_ID) {
2332 			/* hmm, only HW clock id will work. */
2333 			umtx_key_release(&uq->uq_key);
2334 			return (EINVAL);
2335 		}
2336 	} else {
2337 		clockid = CLOCK_REALTIME;
2338 	}
2339 
	/* Queue ourselves before the user mutex is released below. */
2340 	umtxq_lock(&uq->uq_key);
2341 	umtxq_busy(&uq->uq_key);
2342 	umtxq_insert(uq);
2343 	umtxq_unlock(&uq->uq_key);
2344 
2345 	/*
2346 	 * Set c_has_waiters to 1 before releasing user mutex, also
2347 	 * don't modify cache line when unnecessary.
2348 	 */
2349 	error = fueword32(&cv->c_has_waiters, &hasw);
2350 	if (error == 0 && hasw == 0)
2351 		suword32(&cv->c_has_waiters, 1);
2352 
2353 	umtxq_unbusy_unlocked(&uq->uq_key);
2354 
2355 	error = do_unlock_umutex(td, m);
2356 
2357 	if (timeout != NULL)
2358 		abs_timeout_init(&timo, clockid, ((wflags & CVWAIT_ABSTIME) != 0),
2359 			timeout);
2360 
2361 	umtxq_lock(&uq->uq_key);
	/* Sleep only if the user mutex was released successfully. */
2362 	if (error == 0) {
2363 		error = umtxq_sleep(uq, "ucond", timeout == NULL ?
2364 		    NULL : &timo);
2365 	}
2366 
	/* No longer queued: report success whatever 'error' holds. */
2367 	if ((uq->uq_flags & UQF_UMTXQ) == 0)
2368 		error = 0;
2369 	else {
2370 		/*
2371 		 * This must be a timeout, an interruption by a signal or
2372 		 * a spurious wakeup; clear the c_has_waiters flag when
2373 		 * necessary.
2374 		 */
2375 		umtxq_busy(&uq->uq_key);
		/* Check again now that the chain has been marked busy. */
2376 		if ((uq->uq_flags & UQF_UMTXQ) != 0) {
2377 			int oldlen = uq->uq_cur_queue->length;
2378 			umtxq_remove(uq);
2379 			if (oldlen == 1) {
				/* We were the last one queued. */
2380 				umtxq_unlock(&uq->uq_key);
2381 				suword32(&cv->c_has_waiters, 0);
2382 				umtxq_lock(&uq->uq_key);
2383 			}
2384 		}
2385 		umtxq_unbusy(&uq->uq_key);
2386 		if (error == ERESTART)
2387 			error = EINTR;
2388 	}
2389 
2390 	umtxq_unlock(&uq->uq_key);
2391 	umtx_key_release(&uq->uq_key);
2392 	return (error);
2393 }
2394 
2395 /*
2396  * Signal a userland condition variable.
2397  */
2398 static int
2399 do_cv_signal(struct thread *td, struct ucond *cv)
2400 {
2401 	struct umtx_key key;
2402 	int error, cnt, nwake;
2403 	uint32_t flags;
2404 
2405 	error = fueword32(&cv->c_flags, &flags);
2406 	if (error == -1)
2407 		return (EFAULT);
2408 	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
2409 		return (error);
2410 	umtxq_lock(&key);
2411 	umtxq_busy(&key);
2412 	cnt = umtxq_count(&key);
2413 	nwake = umtxq_signal(&key, 1);
2414 	if (cnt <= nwake) {
2415 		umtxq_unlock(&key);
2416 		error = suword32(&cv->c_has_waiters, 0);
2417 		if (error == -1)
2418 			error = EFAULT;
2419 		umtxq_lock(&key);
2420 	}
2421 	umtxq_unbusy(&key);
2422 	umtxq_unlock(&key);
2423 	umtx_key_release(&key);
2424 	return (error);
2425 }
2426 
2427 static int
2428 do_cv_broadcast(struct thread *td, struct ucond *cv)
2429 {
2430 	struct umtx_key key;
2431 	int error;
2432 	uint32_t flags;
2433 
2434 	error = fueword32(&cv->c_flags, &flags);
2435 	if (error == -1)
2436 		return (EFAULT);
2437 	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
2438 		return (error);
2439 
2440 	umtxq_lock(&key);
2441 	umtxq_busy(&key);
2442 	umtxq_signal(&key, INT_MAX);
2443 	umtxq_unlock(&key);
2444 
2445 	error = suword32(&cv->c_has_waiters, 0);
2446 	if (error == -1)
2447 		error = EFAULT;
2448 
2449 	umtxq_unbusy_unlocked(&key);
2450 
2451 	umtx_key_release(&key);
2452 	return (error);
2453 }
2454 
2455 static int
2456 do_rw_rdlock(struct thread *td, struct urwlock *rwlock, long fflag, struct _umtx_time *timeout)
2457 {
2458 	struct abs_timeout timo;
2459 	struct umtx_q *uq;
2460 	uint32_t flags, wrflags;
2461 	int32_t state, oldstate;
2462 	int32_t blocked_readers;
2463 	int error, rv;
2464 
2465 	uq = td->td_umtxq;
2466 	error = fueword32(&rwlock->rw_flags, &flags);
2467 	if (error == -1)
2468 		return (EFAULT);
2469 	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2470 	if (error != 0)
2471 		return (error);
2472 
2473 	if (timeout != NULL)
2474 		abs_timeout_init2(&timo, timeout);
2475 
2476 	wrflags = URWLOCK_WRITE_OWNER;
2477 	if (!(fflag & URWLOCK_PREFER_READER) && !(flags & URWLOCK_PREFER_READER))
2478 		wrflags |= URWLOCK_WRITE_WAITERS;
2479 
2480 	for (;;) {
2481 		rv = fueword32(&rwlock->rw_state, &state);
2482 		if (rv == -1) {
2483 			umtx_key_release(&uq->uq_key);
2484 			return (EFAULT);
2485 		}
2486 
2487 		/* try to lock it */
2488 		while (!(state & wrflags)) {
2489 			if (__predict_false(URWLOCK_READER_COUNT(state) == URWLOCK_MAX_READERS)) {
2490 				umtx_key_release(&uq->uq_key);
2491 				return (EAGAIN);
2492 			}
2493 			rv = casueword32(&rwlock->rw_state, state,
2494 			    &oldstate, state + 1);
2495 			if (rv == -1) {
2496 				umtx_key_release(&uq->uq_key);
2497 				return (EFAULT);
2498 			}
2499 			if (oldstate == state) {
2500 				umtx_key_release(&uq->uq_key);
2501 				return (0);
2502 			}
2503 			error = umtxq_check_susp(td);
2504 			if (error != 0)
2505 				break;
2506 			state = oldstate;
2507 		}
2508 
2509 		if (error)
2510 			break;
2511 
2512 		/* grab monitor lock */
2513 		umtxq_lock(&uq->uq_key);
2514 		umtxq_busy(&uq->uq_key);
2515 		umtxq_unlock(&uq->uq_key);
2516 
2517 		/*
2518 		 * re-read the state, in case it changed between the try-lock above
2519 		 * and the check below
2520 		 */
2521 		rv = fueword32(&rwlock->rw_state, &state);
2522 		if (rv == -1)
2523 			error = EFAULT;
2524 
2525 		/* set read contention bit */
2526 		while (error == 0 && (state & wrflags) &&
2527 		    !(state & URWLOCK_READ_WAITERS)) {
2528 			rv = casueword32(&rwlock->rw_state, state,
2529 			    &oldstate, state | URWLOCK_READ_WAITERS);
2530 			if (rv == -1) {
2531 				error = EFAULT;
2532 				break;
2533 			}
2534 			if (oldstate == state)
2535 				goto sleep;
2536 			state = oldstate;
2537 			error = umtxq_check_susp(td);
2538 			if (error != 0)
2539 				break;
2540 		}
2541 		if (error != 0) {
2542 			umtxq_unbusy_unlocked(&uq->uq_key);
2543 			break;
2544 		}
2545 
2546 		/* state is changed while setting flags, restart */
2547 		if (!(state & wrflags)) {
2548 			umtxq_unbusy_unlocked(&uq->uq_key);
2549 			error = umtxq_check_susp(td);
2550 			if (error != 0)
2551 				break;
2552 			continue;
2553 		}
2554 
2555 sleep:
2556 		/* contention bit is set, before sleeping, increase read waiter count */
2557 		rv = fueword32(&rwlock->rw_blocked_readers,
2558 		    &blocked_readers);
2559 		if (rv == -1) {
2560 			umtxq_unbusy_unlocked(&uq->uq_key);
2561 			error = EFAULT;
2562 			break;
2563 		}
2564 		suword32(&rwlock->rw_blocked_readers, blocked_readers+1);
2565 
2566 		while (state & wrflags) {
2567 			umtxq_lock(&uq->uq_key);
2568 			umtxq_insert(uq);
2569 			umtxq_unbusy(&uq->uq_key);
2570 
2571 			error = umtxq_sleep(uq, "urdlck", timeout == NULL ?
2572 			    NULL : &timo);
2573 
2574 			umtxq_busy(&uq->uq_key);
2575 			umtxq_remove(uq);
2576 			umtxq_unlock(&uq->uq_key);
2577 			if (error)
2578 				break;
2579 			rv = fueword32(&rwlock->rw_state, &state);
2580 			if (rv == -1) {
2581 				error = EFAULT;
2582 				break;
2583 			}
2584 		}
2585 
2586 		/* decrease read waiter count, and may clear read contention bit */
2587 		rv = fueword32(&rwlock->rw_blocked_readers,
2588 		    &blocked_readers);
2589 		if (rv == -1) {
2590 			umtxq_unbusy_unlocked(&uq->uq_key);
2591 			error = EFAULT;
2592 			break;
2593 		}
2594 		suword32(&rwlock->rw_blocked_readers, blocked_readers-1);
2595 		if (blocked_readers == 1) {
2596 			rv = fueword32(&rwlock->rw_state, &state);
2597 			if (rv == -1)
2598 				error = EFAULT;
2599 			while (error == 0) {
2600 				rv = casueword32(&rwlock->rw_state, state,
2601 				    &oldstate, state & ~URWLOCK_READ_WAITERS);
2602 				if (rv == -1) {
2603 					error = EFAULT;
2604 					break;
2605 				}
2606 				if (oldstate == state)
2607 					break;
2608 				state = oldstate;
2609 				error = umtxq_check_susp(td);
2610 			}
2611 		}
2612 
2613 		umtxq_unbusy_unlocked(&uq->uq_key);
2614 		if (error != 0)
2615 			break;
2616 	}
2617 	umtx_key_release(&uq->uq_key);
2618 	if (error == ERESTART)
2619 		error = EINTR;
2620 	return (error);
2621 }
2622 
2623 static int
2624 do_rw_wrlock(struct thread *td, struct urwlock *rwlock, struct _umtx_time *timeout)
2625 {
2626 	struct abs_timeout timo;
2627 	struct umtx_q *uq;
2628 	uint32_t flags;
2629 	int32_t state, oldstate;
2630 	int32_t blocked_writers;
2631 	int32_t blocked_readers;
2632 	int error, rv;
2633 
2634 	uq = td->td_umtxq;
2635 	error = fueword32(&rwlock->rw_flags, &flags);
2636 	if (error == -1)
2637 		return (EFAULT);
2638 	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2639 	if (error != 0)
2640 		return (error);
2641 
2642 	if (timeout != NULL)
2643 		abs_timeout_init2(&timo, timeout);
2644 
2645 	blocked_readers = 0;
2646 	for (;;) {
2647 		rv = fueword32(&rwlock->rw_state, &state);
2648 		if (rv == -1) {
2649 			umtx_key_release(&uq->uq_key);
2650 			return (EFAULT);
2651 		}
2652 		while (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) {
2653 			rv = casueword32(&rwlock->rw_state, state,
2654 			    &oldstate, state | URWLOCK_WRITE_OWNER);
2655 			if (rv == -1) {
2656 				umtx_key_release(&uq->uq_key);
2657 				return (EFAULT);
2658 			}
2659 			if (oldstate == state) {
2660 				umtx_key_release(&uq->uq_key);
2661 				return (0);
2662 			}
2663 			state = oldstate;
2664 			error = umtxq_check_susp(td);
2665 			if (error != 0)
2666 				break;
2667 		}
2668 
2669 		if (error) {
2670 			if (!(state & (URWLOCK_WRITE_OWNER|URWLOCK_WRITE_WAITERS)) &&
2671 			    blocked_readers != 0) {
2672 				umtxq_lock(&uq->uq_key);
2673 				umtxq_busy(&uq->uq_key);
2674 				umtxq_signal_queue(&uq->uq_key, INT_MAX, UMTX_SHARED_QUEUE);
2675 				umtxq_unbusy(&uq->uq_key);
2676 				umtxq_unlock(&uq->uq_key);
2677 			}
2678 
2679 			break;
2680 		}
2681 
2682 		/* grab monitor lock */
2683 		umtxq_lock(&uq->uq_key);
2684 		umtxq_busy(&uq->uq_key);
2685 		umtxq_unlock(&uq->uq_key);
2686 
2687 		/*
2688 		 * re-read the state, in case it changed between the try-lock above
2689 		 * and the check below
2690 		 */
2691 		rv = fueword32(&rwlock->rw_state, &state);
2692 		if (rv == -1)
2693 			error = EFAULT;
2694 
2695 		while (error == 0 && ((state & URWLOCK_WRITE_OWNER) ||
2696 		    URWLOCK_READER_COUNT(state) != 0) &&
2697 		    (state & URWLOCK_WRITE_WAITERS) == 0) {
2698 			rv = casueword32(&rwlock->rw_state, state,
2699 			    &oldstate, state | URWLOCK_WRITE_WAITERS);
2700 			if (rv == -1) {
2701 				error = EFAULT;
2702 				break;
2703 			}
2704 			if (oldstate == state)
2705 				goto sleep;
2706 			state = oldstate;
2707 			error = umtxq_check_susp(td);
2708 			if (error != 0)
2709 				break;
2710 		}
2711 		if (error != 0) {
2712 			umtxq_unbusy_unlocked(&uq->uq_key);
2713 			break;
2714 		}
2715 
2716 		if (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) {
2717 			umtxq_unbusy_unlocked(&uq->uq_key);
2718 			error = umtxq_check_susp(td);
2719 			if (error != 0)
2720 				break;
2721 			continue;
2722 		}
2723 sleep:
2724 		rv = fueword32(&rwlock->rw_blocked_writers,
2725 		    &blocked_writers);
2726 		if (rv == -1) {
2727 			umtxq_unbusy_unlocked(&uq->uq_key);
2728 			error = EFAULT;
2729 			break;
2730 		}
2731 		suword32(&rwlock->rw_blocked_writers, blocked_writers+1);
2732 
2733 		while ((state & URWLOCK_WRITE_OWNER) || URWLOCK_READER_COUNT(state) != 0) {
2734 			umtxq_lock(&uq->uq_key);
2735 			umtxq_insert_queue(uq, UMTX_EXCLUSIVE_QUEUE);
2736 			umtxq_unbusy(&uq->uq_key);
2737 
2738 			error = umtxq_sleep(uq, "uwrlck", timeout == NULL ?
2739 			    NULL : &timo);
2740 
2741 			umtxq_busy(&uq->uq_key);
2742 			umtxq_remove_queue(uq, UMTX_EXCLUSIVE_QUEUE);
2743 			umtxq_unlock(&uq->uq_key);
2744 			if (error)
2745 				break;
2746 			rv = fueword32(&rwlock->rw_state, &state);
2747 			if (rv == -1) {
2748 				error = EFAULT;
2749 				break;
2750 			}
2751 		}
2752 
2753 		rv = fueword32(&rwlock->rw_blocked_writers,
2754 		    &blocked_writers);
2755 		if (rv == -1) {
2756 			umtxq_unbusy_unlocked(&uq->uq_key);
2757 			error = EFAULT;
2758 			break;
2759 		}
2760 		suword32(&rwlock->rw_blocked_writers, blocked_writers-1);
2761 		if (blocked_writers == 1) {
2762 			rv = fueword32(&rwlock->rw_state, &state);
2763 			if (rv == -1) {
2764 				umtxq_unbusy_unlocked(&uq->uq_key);
2765 				error = EFAULT;
2766 				break;
2767 			}
2768 			for (;;) {
2769 				rv = casueword32(&rwlock->rw_state, state,
2770 				    &oldstate, state & ~URWLOCK_WRITE_WAITERS);
2771 				if (rv == -1) {
2772 					error = EFAULT;
2773 					break;
2774 				}
2775 				if (oldstate == state)
2776 					break;
2777 				state = oldstate;
2778 				error = umtxq_check_susp(td);
2779 				/*
2780 				 * We are leaving the URWLOCK_WRITE_WAITERS
2781 				 * behind, but this should not harm the
2782 				 * correctness.
2783 				 */
2784 				if (error != 0)
2785 					break;
2786 			}
2787 			rv = fueword32(&rwlock->rw_blocked_readers,
2788 			    &blocked_readers);
2789 			if (rv == -1) {
2790 				umtxq_unbusy_unlocked(&uq->uq_key);
2791 				error = EFAULT;
2792 				break;
2793 			}
2794 		} else
2795 			blocked_readers = 0;
2796 
2797 		umtxq_unbusy_unlocked(&uq->uq_key);
2798 	}
2799 
2800 	umtx_key_release(&uq->uq_key);
2801 	if (error == ERESTART)
2802 		error = EINTR;
2803 	return (error);
2804 }
2805 
2806 static int
2807 do_rw_unlock(struct thread *td, struct urwlock *rwlock)
2808 {
2809 	struct umtx_q *uq;
2810 	uint32_t flags;
2811 	int32_t state, oldstate;
2812 	int error, rv, q, count;
2813 
2814 	uq = td->td_umtxq;
2815 	error = fueword32(&rwlock->rw_flags, &flags);
2816 	if (error == -1)
2817 		return (EFAULT);
2818 	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2819 	if (error != 0)
2820 		return (error);
2821 
2822 	error = fueword32(&rwlock->rw_state, &state);
2823 	if (error == -1) {
2824 		error = EFAULT;
2825 		goto out;
2826 	}
2827 	if (state & URWLOCK_WRITE_OWNER) {
2828 		for (;;) {
2829 			rv = casueword32(&rwlock->rw_state, state,
2830 			    &oldstate, state & ~URWLOCK_WRITE_OWNER);
2831 			if (rv == -1) {
2832 				error = EFAULT;
2833 				goto out;
2834 			}
2835 			if (oldstate != state) {
2836 				state = oldstate;
2837 				if (!(oldstate & URWLOCK_WRITE_OWNER)) {
2838 					error = EPERM;
2839 					goto out;
2840 				}
2841 				error = umtxq_check_susp(td);
2842 				if (error != 0)
2843 					goto out;
2844 			} else
2845 				break;
2846 		}
2847 	} else if (URWLOCK_READER_COUNT(state) != 0) {
2848 		for (;;) {
2849 			rv = casueword32(&rwlock->rw_state, state,
2850 			    &oldstate, state - 1);
2851 			if (rv == -1) {
2852 				error = EFAULT;
2853 				goto out;
2854 			}
2855 			if (oldstate != state) {
2856 				state = oldstate;
2857 				if (URWLOCK_READER_COUNT(oldstate) == 0) {
2858 					error = EPERM;
2859 					goto out;
2860 				}
2861 				error = umtxq_check_susp(td);
2862 				if (error != 0)
2863 					goto out;
2864 			} else
2865 				break;
2866 		}
2867 	} else {
2868 		error = EPERM;
2869 		goto out;
2870 	}
2871 
2872 	count = 0;
2873 
2874 	if (!(flags & URWLOCK_PREFER_READER)) {
2875 		if (state & URWLOCK_WRITE_WAITERS) {
2876 			count = 1;
2877 			q = UMTX_EXCLUSIVE_QUEUE;
2878 		} else if (state & URWLOCK_READ_WAITERS) {
2879 			count = INT_MAX;
2880 			q = UMTX_SHARED_QUEUE;
2881 		}
2882 	} else {
2883 		if (state & URWLOCK_READ_WAITERS) {
2884 			count = INT_MAX;
2885 			q = UMTX_SHARED_QUEUE;
2886 		} else if (state & URWLOCK_WRITE_WAITERS) {
2887 			count = 1;
2888 			q = UMTX_EXCLUSIVE_QUEUE;
2889 		}
2890 	}
2891 
2892 	if (count) {
2893 		umtxq_lock(&uq->uq_key);
2894 		umtxq_busy(&uq->uq_key);
2895 		umtxq_signal_queue(&uq->uq_key, count, q);
2896 		umtxq_unbusy(&uq->uq_key);
2897 		umtxq_unlock(&uq->uq_key);
2898 	}
2899 out:
2900 	umtx_key_release(&uq->uq_key);
2901 	return (error);
2902 }
2903 
2904 #if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10)
2905 static int
2906 do_sem_wait(struct thread *td, struct _usem *sem, struct _umtx_time *timeout)
2907 {
2908 	struct abs_timeout timo;
2909 	struct umtx_q *uq;
2910 	uint32_t flags, count, count1;
2911 	int error, rv;
2912 
2913 	uq = td->td_umtxq;
2914 	error = fueword32(&sem->_flags, &flags);
2915 	if (error == -1)
2916 		return (EFAULT);
2917 	error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &uq->uq_key);
2918 	if (error != 0)
2919 		return (error);
2920 
2921 	if (timeout != NULL)
2922 		abs_timeout_init2(&timo, timeout);
2923 
2924 	umtxq_lock(&uq->uq_key);
2925 	umtxq_busy(&uq->uq_key);
2926 	umtxq_insert(uq);
2927 	umtxq_unlock(&uq->uq_key);
2928 	rv = casueword32(&sem->_has_waiters, 0, &count1, 1);
2929 	if (rv == 0)
2930 		rv = fueword32(&sem->_count, &count);
2931 	if (rv == -1 || count != 0) {
2932 		umtxq_lock(&uq->uq_key);
2933 		umtxq_unbusy(&uq->uq_key);
2934 		umtxq_remove(uq);
2935 		umtxq_unlock(&uq->uq_key);
2936 		umtx_key_release(&uq->uq_key);
2937 		return (rv == -1 ? EFAULT : 0);
2938 	}
2939 	umtxq_lock(&uq->uq_key);
2940 	umtxq_unbusy(&uq->uq_key);
2941 
2942 	error = umtxq_sleep(uq, "usem", timeout == NULL ? NULL : &timo);
2943 
2944 	if ((uq->uq_flags & UQF_UMTXQ) == 0)
2945 		error = 0;
2946 	else {
2947 		umtxq_remove(uq);
2948 		/* A relative timeout cannot be restarted. */
2949 		if (error == ERESTART && timeout != NULL &&
2950 		    (timeout->_flags & UMTX_ABSTIME) == 0)
2951 			error = EINTR;
2952 	}
2953 	umtxq_unlock(&uq->uq_key);
2954 	umtx_key_release(&uq->uq_key);
2955 	return (error);
2956 }
2957 
2958 /*
2959  * Signal a userland semaphore.
2960  */
2961 static int
2962 do_sem_wake(struct thread *td, struct _usem *sem)
2963 {
2964 	struct umtx_key key;
2965 	int error, cnt;
2966 	uint32_t flags;
2967 
2968 	error = fueword32(&sem->_flags, &flags);
2969 	if (error == -1)
2970 		return (EFAULT);
2971 	if ((error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &key)) != 0)
2972 		return (error);
2973 	umtxq_lock(&key);
2974 	umtxq_busy(&key);
2975 	cnt = umtxq_count(&key);
2976 	if (cnt > 0) {
2977 		umtxq_signal(&key, 1);
2978 		/*
2979 		 * Check if count is greater than 0, this means the memory is
2980 		 * still being referenced by user code, so we can safely
2981 		 * update _has_waiters flag.
2982 		 */
2983 		if (cnt == 1) {
2984 			umtxq_unlock(&key);
2985 			error = suword32(&sem->_has_waiters, 0);
2986 			umtxq_lock(&key);
2987 			if (error == -1)
2988 				error = EFAULT;
2989 		}
2990 	}
2991 	umtxq_unbusy(&key);
2992 	umtxq_unlock(&key);
2993 	umtx_key_release(&key);
2994 	return (error);
2995 }
2996 #endif
2997 
2998 static int
2999 do_sem2_wait(struct thread *td, struct _usem2 *sem, struct _umtx_time *timeout)
3000 {
3001 	struct abs_timeout timo;
3002 	struct umtx_q *uq;
3003 	uint32_t count, flags;
3004 	int error, rv;
3005 
3006 	uq = td->td_umtxq;
3007 	flags = fuword32(&sem->_flags);
3008 	error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &uq->uq_key);
3009 	if (error != 0)
3010 		return (error);
3011 
3012 	if (timeout != NULL)
3013 		abs_timeout_init2(&timo, timeout);
3014 
3015 	umtxq_lock(&uq->uq_key);
3016 	umtxq_busy(&uq->uq_key);
3017 	umtxq_insert(uq);
3018 	umtxq_unlock(&uq->uq_key);
3019 	rv = fueword32(&sem->_count, &count);
3020 	if (rv == -1) {
3021 		umtxq_lock(&uq->uq_key);
3022 		umtxq_unbusy(&uq->uq_key);
3023 		umtxq_remove(uq);
3024 		umtxq_unlock(&uq->uq_key);
3025 		umtx_key_release(&uq->uq_key);
3026 		return (EFAULT);
3027 	}
3028 	for (;;) {
3029 		if (USEM_COUNT(count) != 0) {
3030 			umtxq_lock(&uq->uq_key);
3031 			umtxq_unbusy(&uq->uq_key);
3032 			umtxq_remove(uq);
3033 			umtxq_unlock(&uq->uq_key);
3034 			umtx_key_release(&uq->uq_key);
3035 			return (0);
3036 		}
3037 		if (count == USEM_HAS_WAITERS)
3038 			break;
3039 		rv = casueword32(&sem->_count, 0, &count, USEM_HAS_WAITERS);
3040 		if (rv == -1) {
3041 			umtxq_lock(&uq->uq_key);
3042 			umtxq_unbusy(&uq->uq_key);
3043 			umtxq_remove(uq);
3044 			umtxq_unlock(&uq->uq_key);
3045 			umtx_key_release(&uq->uq_key);
3046 			return (EFAULT);
3047 		}
3048 		if (count == 0)
3049 			break;
3050 	}
3051 	umtxq_lock(&uq->uq_key);
3052 	umtxq_unbusy(&uq->uq_key);
3053 
3054 	error = umtxq_sleep(uq, "usem", timeout == NULL ? NULL : &timo);
3055 
3056 	if ((uq->uq_flags & UQF_UMTXQ) == 0)
3057 		error = 0;
3058 	else {
3059 		umtxq_remove(uq);
3060 		/* A relative timeout cannot be restarted. */
3061 		if (error == ERESTART && timeout != NULL &&
3062 		    (timeout->_flags & UMTX_ABSTIME) == 0)
3063 			error = EINTR;
3064 	}
3065 	umtxq_unlock(&uq->uq_key);
3066 	umtx_key_release(&uq->uq_key);
3067 	return (error);
3068 }
3069 
3070 /*
3071  * Signal a userland semaphore.
3072  */
3073 static int
3074 do_sem2_wake(struct thread *td, struct _usem2 *sem)
3075 {
3076 	struct umtx_key key;
3077 	int error, cnt, rv;
3078 	uint32_t count, flags;
3079 
3080 	rv = fueword32(&sem->_flags, &flags);
3081 	if (rv == -1)
3082 		return (EFAULT);
3083 	if ((error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &key)) != 0)
3084 		return (error);
3085 	umtxq_lock(&key);
3086 	umtxq_busy(&key);
3087 	cnt = umtxq_count(&key);
3088 	if (cnt > 0) {
3089 		umtxq_signal(&key, 1);
3090 
3091 		/*
3092 		 * If this was the last sleeping thread, clear the waiters
3093 		 * flag in _count.
3094 		 */
3095 		if (cnt == 1) {
3096 			umtxq_unlock(&key);
3097 			rv = fueword32(&sem->_count, &count);
3098 			while (rv != -1 && count & USEM_HAS_WAITERS)
3099 				rv = casueword32(&sem->_count, count, &count,
3100 				    count & ~USEM_HAS_WAITERS);
3101 			if (rv == -1)
3102 				error = EFAULT;
3103 			umtxq_lock(&key);
3104 		}
3105 	}
3106 	umtxq_unbusy(&key);
3107 	umtxq_unlock(&key);
3108 	umtx_key_release(&key);
3109 	return (error);
3110 }
3111 
/*
 * Copy a struct timespec in from user address addr into *tsp and
 * validate it.  Returns the copyin() error, EINVAL when tv_sec is
 * negative or tv_nsec is outside [0, 1000000000), and 0 otherwise.
 */
inline int
umtx_copyin_timeout(const void *addr, struct timespec *tsp)
{
	int error;

	error = copyin(addr, tsp, sizeof(struct timespec));
	if (error != 0)
		return (error);
	if (tsp->tv_sec < 0 || tsp->tv_nsec < 0 ||
	    tsp->tv_nsec >= 1000000000)
		return (EINVAL);
	return (0);
}
3126 
3127 static inline int
3128 umtx_copyin_umtx_time(const void *addr, size_t size, struct _umtx_time *tp)
3129 {
3130 	int error;
3131 
3132 	if (size <= sizeof(struct timespec)) {
3133 		tp->_clockid = CLOCK_REALTIME;
3134 		tp->_flags = 0;
3135 		error = copyin(addr, &tp->_timeout, sizeof(struct timespec));
3136 	} else
3137 		error = copyin(addr, tp, sizeof(struct _umtx_time));
3138 	if (error != 0)
3139 		return (error);
3140 	if (tp->_timeout.tv_sec < 0 ||
3141 	    tp->_timeout.tv_nsec >= 1000000000 || tp->_timeout.tv_nsec < 0)
3142 		return (EINVAL);
3143 	return (0);
3144 }
3145 
/*
 * Handler for operation slots that are not implemented: always fails
 * with EOPNOTSUPP and ignores its arguments.
 */
static int
__umtx_op_unimpl(struct thread *td, struct _umtx_op_args *uap)
{
	int error = EOPNOTSUPP;

	return (error);
}
3152 
3153 static int
3154 __umtx_op_wait(struct thread *td, struct _umtx_op_args *uap)
3155 {
3156 	struct _umtx_time timeout, *tm_p;
3157 	int error;
3158 
3159 	if (uap->uaddr2 == NULL)
3160 		tm_p = NULL;
3161 	else {
3162 		error = umtx_copyin_umtx_time(
3163 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3164 		if (error != 0)
3165 			return (error);
3166 		tm_p = &timeout;
3167 	}
3168 	return do_wait(td, uap->obj, uap->val, tm_p, 0, 0);
3169 }
3170 
3171 static int
3172 __umtx_op_wait_uint(struct thread *td, struct _umtx_op_args *uap)
3173 {
3174 	struct _umtx_time timeout, *tm_p;
3175 	int error;
3176 
3177 	if (uap->uaddr2 == NULL)
3178 		tm_p = NULL;
3179 	else {
3180 		error = umtx_copyin_umtx_time(
3181 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3182 		if (error != 0)
3183 			return (error);
3184 		tm_p = &timeout;
3185 	}
3186 	return do_wait(td, uap->obj, uap->val, tm_p, 1, 0);
3187 }
3188 
3189 static int
3190 __umtx_op_wait_uint_private(struct thread *td, struct _umtx_op_args *uap)
3191 {
3192 	struct _umtx_time *tm_p, timeout;
3193 	int error;
3194 
3195 	if (uap->uaddr2 == NULL)
3196 		tm_p = NULL;
3197 	else {
3198 		error = umtx_copyin_umtx_time(
3199 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3200 		if (error != 0)
3201 			return (error);
3202 		tm_p = &timeout;
3203 	}
3204 	return do_wait(td, uap->obj, uap->val, tm_p, 1, 1);
3205 }
3206 
3207 static int
3208 __umtx_op_wake(struct thread *td, struct _umtx_op_args *uap)
3209 {
3210 	return (kern_umtx_wake(td, uap->obj, uap->val, 0));
3211 }
3212 
3213 #define BATCH_SIZE	128
3214 static int
3215 __umtx_op_nwake_private(struct thread *td, struct _umtx_op_args *uap)
3216 {
3217 	int count = uap->val;
3218 	void *uaddrs[BATCH_SIZE];
3219 	char **upp = (char **)uap->obj;
3220 	int tocopy;
3221 	int error = 0;
3222 	int i, pos = 0;
3223 
3224 	while (count > 0) {
3225 		tocopy = count;
3226 		if (tocopy > BATCH_SIZE)
3227 			tocopy = BATCH_SIZE;
3228 		error = copyin(upp+pos, uaddrs, tocopy * sizeof(char *));
3229 		if (error != 0)
3230 			break;
3231 		for (i = 0; i < tocopy; ++i)
3232 			kern_umtx_wake(td, uaddrs[i], INT_MAX, 1);
3233 		count -= tocopy;
3234 		pos += tocopy;
3235 	}
3236 	return (error);
3237 }
3238 
3239 static int
3240 __umtx_op_wake_private(struct thread *td, struct _umtx_op_args *uap)
3241 {
3242 	return (kern_umtx_wake(td, uap->obj, uap->val, 1));
3243 }
3244 
3245 static int
3246 __umtx_op_lock_umutex(struct thread *td, struct _umtx_op_args *uap)
3247 {
3248 	struct _umtx_time *tm_p, timeout;
3249 	int error;
3250 
3251 	/* Allow a null timespec (wait forever). */
3252 	if (uap->uaddr2 == NULL)
3253 		tm_p = NULL;
3254 	else {
3255 		error = umtx_copyin_umtx_time(
3256 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3257 		if (error != 0)
3258 			return (error);
3259 		tm_p = &timeout;
3260 	}
3261 	return do_lock_umutex(td, uap->obj, tm_p, 0);
3262 }
3263 
3264 static int
3265 __umtx_op_trylock_umutex(struct thread *td, struct _umtx_op_args *uap)
3266 {
3267 	return do_lock_umutex(td, uap->obj, NULL, _UMUTEX_TRY);
3268 }
3269 
3270 static int
3271 __umtx_op_wait_umutex(struct thread *td, struct _umtx_op_args *uap)
3272 {
3273 	struct _umtx_time *tm_p, timeout;
3274 	int error;
3275 
3276 	/* Allow a null timespec (wait forever). */
3277 	if (uap->uaddr2 == NULL)
3278 		tm_p = NULL;
3279 	else {
3280 		error = umtx_copyin_umtx_time(
3281 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3282 		if (error != 0)
3283 			return (error);
3284 		tm_p = &timeout;
3285 	}
3286 	return do_lock_umutex(td, uap->obj, tm_p, _UMUTEX_WAIT);
3287 }
3288 
3289 static int
3290 __umtx_op_wake_umutex(struct thread *td, struct _umtx_op_args *uap)
3291 {
3292 	return do_wake_umutex(td, uap->obj);
3293 }
3294 
3295 static int
3296 __umtx_op_unlock_umutex(struct thread *td, struct _umtx_op_args *uap)
3297 {
3298 	return do_unlock_umutex(td, uap->obj);
3299 }
3300 
3301 static int
3302 __umtx_op_set_ceiling(struct thread *td, struct _umtx_op_args *uap)
3303 {
3304 	return do_set_ceiling(td, uap->obj, uap->val, uap->uaddr1);
3305 }
3306 
3307 static int
3308 __umtx_op_cv_wait(struct thread *td, struct _umtx_op_args *uap)
3309 {
3310 	struct timespec *ts, timeout;
3311 	int error;
3312 
3313 	/* Allow a null timespec (wait forever). */
3314 	if (uap->uaddr2 == NULL)
3315 		ts = NULL;
3316 	else {
3317 		error = umtx_copyin_timeout(uap->uaddr2, &timeout);
3318 		if (error != 0)
3319 			return (error);
3320 		ts = &timeout;
3321 	}
3322 	return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
3323 }
3324 
3325 static int
3326 __umtx_op_cv_signal(struct thread *td, struct _umtx_op_args *uap)
3327 {
3328 	return do_cv_signal(td, uap->obj);
3329 }
3330 
3331 static int
3332 __umtx_op_cv_broadcast(struct thread *td, struct _umtx_op_args *uap)
3333 {
3334 	return do_cv_broadcast(td, uap->obj);
3335 }
3336 
3337 static int
3338 __umtx_op_rw_rdlock(struct thread *td, struct _umtx_op_args *uap)
3339 {
3340 	struct _umtx_time timeout;
3341 	int error;
3342 
3343 	/* Allow a null timespec (wait forever). */
3344 	if (uap->uaddr2 == NULL) {
3345 		error = do_rw_rdlock(td, uap->obj, uap->val, 0);
3346 	} else {
3347 		error = umtx_copyin_umtx_time(uap->uaddr2,
3348 		   (size_t)uap->uaddr1, &timeout);
3349 		if (error != 0)
3350 			return (error);
3351 		error = do_rw_rdlock(td, uap->obj, uap->val, &timeout);
3352 	}
3353 	return (error);
3354 }
3355 
3356 static int
3357 __umtx_op_rw_wrlock(struct thread *td, struct _umtx_op_args *uap)
3358 {
3359 	struct _umtx_time timeout;
3360 	int error;
3361 
3362 	/* Allow a null timespec (wait forever). */
3363 	if (uap->uaddr2 == NULL) {
3364 		error = do_rw_wrlock(td, uap->obj, 0);
3365 	} else {
3366 		error = umtx_copyin_umtx_time(uap->uaddr2,
3367 		   (size_t)uap->uaddr1, &timeout);
3368 		if (error != 0)
3369 			return (error);
3370 
3371 		error = do_rw_wrlock(td, uap->obj, &timeout);
3372 	}
3373 	return (error);
3374 }
3375 
3376 static int
3377 __umtx_op_rw_unlock(struct thread *td, struct _umtx_op_args *uap)
3378 {
3379 	return do_rw_unlock(td, uap->obj);
3380 }
3381 
3382 #if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10)
3383 static int
3384 __umtx_op_sem_wait(struct thread *td, struct _umtx_op_args *uap)
3385 {
3386 	struct _umtx_time *tm_p, timeout;
3387 	int error;
3388 
3389 	/* Allow a null timespec (wait forever). */
3390 	if (uap->uaddr2 == NULL)
3391 		tm_p = NULL;
3392 	else {
3393 		error = umtx_copyin_umtx_time(
3394 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3395 		if (error != 0)
3396 			return (error);
3397 		tm_p = &timeout;
3398 	}
3399 	return (do_sem_wait(td, uap->obj, tm_p));
3400 }
3401 
3402 static int
3403 __umtx_op_sem_wake(struct thread *td, struct _umtx_op_args *uap)
3404 {
3405 
3406 	return (do_sem_wake(td, uap->obj));
3407 }
3408 #endif
3409 
3410 static int
3411 __umtx_op_wake2_umutex(struct thread *td, struct _umtx_op_args *uap)
3412 {
3413 
3414 	return (do_wake2_umutex(td, uap->obj, uap->val));
3415 }
3416 
3417 static int
3418 __umtx_op_sem2_wait(struct thread *td, struct _umtx_op_args *uap)
3419 {
3420 	struct _umtx_time *tm_p, timeout;
3421 	int error;
3422 
3423 	/* Allow a null timespec (wait forever). */
3424 	if (uap->uaddr2 == NULL)
3425 		tm_p = NULL;
3426 	else {
3427 		error = umtx_copyin_umtx_time(
3428 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3429 		if (error != 0)
3430 			return (error);
3431 		tm_p = &timeout;
3432 	}
3433 	return (do_sem2_wait(td, uap->obj, tm_p));
3434 }
3435 
3436 static int
3437 __umtx_op_sem2_wake(struct thread *td, struct _umtx_op_args *uap)
3438 {
3439 
3440 	return (do_sem2_wake(td, uap->obj));
3441 }
3442 
3443 typedef int (*_umtx_op_func)(struct thread *td, struct _umtx_op_args *uap);
3444 
3445 static const _umtx_op_func op_table[] = {
3446 	[UMTX_OP_RESERVED0]	= __umtx_op_unimpl,
3447 	[UMTX_OP_RESERVED1]	= __umtx_op_unimpl,
3448 	[UMTX_OP_WAIT]		= __umtx_op_wait,
3449 	[UMTX_OP_WAKE]		= __umtx_op_wake,
3450 	[UMTX_OP_MUTEX_TRYLOCK]	= __umtx_op_trylock_umutex,
3451 	[UMTX_OP_MUTEX_LOCK]	= __umtx_op_lock_umutex,
3452 	[UMTX_OP_MUTEX_UNLOCK]	= __umtx_op_unlock_umutex,
3453 	[UMTX_OP_SET_CEILING]	= __umtx_op_set_ceiling,
3454 	[UMTX_OP_CV_WAIT]	= __umtx_op_cv_wait,
3455 	[UMTX_OP_CV_SIGNAL]	= __umtx_op_cv_signal,
3456 	[UMTX_OP_CV_BROADCAST]	= __umtx_op_cv_broadcast,
3457 	[UMTX_OP_WAIT_UINT]	= __umtx_op_wait_uint,
3458 	[UMTX_OP_RW_RDLOCK]	= __umtx_op_rw_rdlock,
3459 	[UMTX_OP_RW_WRLOCK]	= __umtx_op_rw_wrlock,
3460 	[UMTX_OP_RW_UNLOCK]	= __umtx_op_rw_unlock,
3461 	[UMTX_OP_WAIT_UINT_PRIVATE] = __umtx_op_wait_uint_private,
3462 	[UMTX_OP_WAKE_PRIVATE]	= __umtx_op_wake_private,
3463 	[UMTX_OP_MUTEX_WAIT]	= __umtx_op_wait_umutex,
3464 	[UMTX_OP_MUTEX_WAKE]	= __umtx_op_wake_umutex,
3465 #if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10)
3466 	[UMTX_OP_SEM_WAIT]	= __umtx_op_sem_wait,
3467 	[UMTX_OP_SEM_WAKE]	= __umtx_op_sem_wake,
3468 #else
3469 	[UMTX_OP_SEM_WAIT]	= __umtx_op_unimpl,
3470 	[UMTX_OP_SEM_WAKE]	= __umtx_op_unimpl,
3471 #endif
3472 	[UMTX_OP_NWAKE_PRIVATE]	= __umtx_op_nwake_private,
3473 	[UMTX_OP_MUTEX_WAKE2]	= __umtx_op_wake2_umutex,
3474 	[UMTX_OP_SEM2_WAIT]	= __umtx_op_sem2_wait,
3475 	[UMTX_OP_SEM2_WAKE]	= __umtx_op_sem2_wake,
3476 };
3477 
3478 int
3479 sys__umtx_op(struct thread *td, struct _umtx_op_args *uap)
3480 {
3481 
3482 	if ((unsigned)uap->op < nitems(op_table))
3483 		return (*op_table[uap->op])(td, uap);
3484 	return (EINVAL);
3485 }
3486 
3487 #ifdef COMPAT_FREEBSD32
3488 
/* Layout of a timespec as copied in from a 32-bit process. */
struct timespec32 {
	int32_t	tv_sec;
	int32_t	tv_nsec;
};

/*
 * Layout of the timeout description as copied in from a 32-bit process:
 * a timespec32 followed by the flags and clock id words.
 */
struct umtx_time32 {
	struct timespec32	timeout;
	uint32_t		flags;
	uint32_t		clockid;
};
3499 
3500 static inline int
3501 umtx_copyin_timeout32(void *addr, struct timespec *tsp)
3502 {
3503 	struct timespec32 ts32;
3504 	int error;
3505 
3506 	error = copyin(addr, &ts32, sizeof(struct timespec32));
3507 	if (error == 0) {
3508 		if (ts32.tv_sec < 0 ||
3509 		    ts32.tv_nsec >= 1000000000 ||
3510 		    ts32.tv_nsec < 0)
3511 			error = EINVAL;
3512 		else {
3513 			tsp->tv_sec = ts32.tv_sec;
3514 			tsp->tv_nsec = ts32.tv_nsec;
3515 		}
3516 	}
3517 	return (error);
3518 }
3519 
3520 static inline int
3521 umtx_copyin_umtx_time32(const void *addr, size_t size, struct _umtx_time *tp)
3522 {
3523 	struct umtx_time32 t32;
3524 	int error;
3525 
3526 	t32.clockid = CLOCK_REALTIME;
3527 	t32.flags   = 0;
3528 	if (size <= sizeof(struct timespec32))
3529 		error = copyin(addr, &t32.timeout, sizeof(struct timespec32));
3530 	else
3531 		error = copyin(addr, &t32, sizeof(struct umtx_time32));
3532 	if (error != 0)
3533 		return (error);
3534 	if (t32.timeout.tv_sec < 0 ||
3535 	    t32.timeout.tv_nsec >= 1000000000 || t32.timeout.tv_nsec < 0)
3536 		return (EINVAL);
3537 	tp->_timeout.tv_sec = t32.timeout.tv_sec;
3538 	tp->_timeout.tv_nsec = t32.timeout.tv_nsec;
3539 	tp->_flags = t32.flags;
3540 	tp->_clockid = t32.clockid;
3541 	return (0);
3542 }
3543 
3544 static int
3545 __umtx_op_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
3546 {
3547 	struct _umtx_time *tm_p, timeout;
3548 	int error;
3549 
3550 	if (uap->uaddr2 == NULL)
3551 		tm_p = NULL;
3552 	else {
3553 		error = umtx_copyin_umtx_time32(uap->uaddr2,
3554 			(size_t)uap->uaddr1, &timeout);
3555 		if (error != 0)
3556 			return (error);
3557 		tm_p = &timeout;
3558 	}
3559 	return do_wait(td, uap->obj, uap->val, tm_p, 1, 0);
3560 }
3561 
3562 static int
3563 __umtx_op_lock_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
3564 {
3565 	struct _umtx_time *tm_p, timeout;
3566 	int error;
3567 
3568 	/* Allow a null timespec (wait forever). */
3569 	if (uap->uaddr2 == NULL)
3570 		tm_p = NULL;
3571 	else {
3572 		error = umtx_copyin_umtx_time(uap->uaddr2,
3573 			    (size_t)uap->uaddr1, &timeout);
3574 		if (error != 0)
3575 			return (error);
3576 		tm_p = &timeout;
3577 	}
3578 	return do_lock_umutex(td, uap->obj, tm_p, 0);
3579 }
3580 
3581 static int
3582 __umtx_op_wait_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
3583 {
3584 	struct _umtx_time *tm_p, timeout;
3585 	int error;
3586 
3587 	/* Allow a null timespec (wait forever). */
3588 	if (uap->uaddr2 == NULL)
3589 		tm_p = NULL;
3590 	else {
3591 		error = umtx_copyin_umtx_time32(uap->uaddr2,
3592 		    (size_t)uap->uaddr1, &timeout);
3593 		if (error != 0)
3594 			return (error);
3595 		tm_p = &timeout;
3596 	}
3597 	return do_lock_umutex(td, uap->obj, tm_p, _UMUTEX_WAIT);
3598 }
3599 
3600 static int
3601 __umtx_op_cv_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
3602 {
3603 	struct timespec *ts, timeout;
3604 	int error;
3605 
3606 	/* Allow a null timespec (wait forever). */
3607 	if (uap->uaddr2 == NULL)
3608 		ts = NULL;
3609 	else {
3610 		error = umtx_copyin_timeout32(uap->uaddr2, &timeout);
3611 		if (error != 0)
3612 			return (error);
3613 		ts = &timeout;
3614 	}
3615 	return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
3616 }
3617 
3618 static int
3619 __umtx_op_rw_rdlock_compat32(struct thread *td, struct _umtx_op_args *uap)
3620 {
3621 	struct _umtx_time timeout;
3622 	int error;
3623 
3624 	/* Allow a null timespec (wait forever). */
3625 	if (uap->uaddr2 == NULL) {
3626 		error = do_rw_rdlock(td, uap->obj, uap->val, 0);
3627 	} else {
3628 		error = umtx_copyin_umtx_time32(uap->uaddr2,
3629 		    (size_t)uap->uaddr1, &timeout);
3630 		if (error != 0)
3631 			return (error);
3632 		error = do_rw_rdlock(td, uap->obj, uap->val, &timeout);
3633 	}
3634 	return (error);
3635 }
3636 
3637 static int
3638 __umtx_op_rw_wrlock_compat32(struct thread *td, struct _umtx_op_args *uap)
3639 {
3640 	struct _umtx_time timeout;
3641 	int error;
3642 
3643 	/* Allow a null timespec (wait forever). */
3644 	if (uap->uaddr2 == NULL) {
3645 		error = do_rw_wrlock(td, uap->obj, 0);
3646 	} else {
3647 		error = umtx_copyin_umtx_time32(uap->uaddr2,
3648 		    (size_t)uap->uaddr1, &timeout);
3649 		if (error != 0)
3650 			return (error);
3651 		error = do_rw_wrlock(td, uap->obj, &timeout);
3652 	}
3653 	return (error);
3654 }
3655 
3656 static int
3657 __umtx_op_wait_uint_private_compat32(struct thread *td, struct _umtx_op_args *uap)
3658 {
3659 	struct _umtx_time *tm_p, timeout;
3660 	int error;
3661 
3662 	if (uap->uaddr2 == NULL)
3663 		tm_p = NULL;
3664 	else {
3665 		error = umtx_copyin_umtx_time32(
3666 		    uap->uaddr2, (size_t)uap->uaddr1,&timeout);
3667 		if (error != 0)
3668 			return (error);
3669 		tm_p = &timeout;
3670 	}
3671 	return do_wait(td, uap->obj, uap->val, tm_p, 1, 1);
3672 }
3673 
#if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10)
/*
 * 32-bit compat handler for UMTX_OP_SEM_WAIT, built only when
 * COMPAT_FREEBSD9 or COMPAT_FREEBSD10 is defined: optionally read a
 * 32-bit timeout from uap->uaddr2 (its size is carried in uap->uaddr1)
 * and hand off to do_sem_wait().
 *
 * A NULL uap->uaddr2 means no timeout (wait forever).
 */
static int
__umtx_op_sem_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
{
	struct _umtx_time timeout;
	int error;

	if (uap->uaddr2 == NULL)
		return (do_sem_wait(td, uap->obj, NULL));

	error = umtx_copyin_umtx_time32(uap->uaddr2, (size_t)uap->uaddr1,
	    &timeout);
	if (error != 0)
		return (error);
	return (do_sem_wait(td, uap->obj, &timeout));
}
#endif
3694 
3695 static int
3696 __umtx_op_sem2_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
3697 {
3698 	struct _umtx_time *tm_p, timeout;
3699 	int error;
3700 
3701 	/* Allow a null timespec (wait forever). */
3702 	if (uap->uaddr2 == NULL)
3703 		tm_p = NULL;
3704 	else {
3705 		error = umtx_copyin_umtx_time32(uap->uaddr2,
3706 		    (size_t)uap->uaddr1, &timeout);
3707 		if (error != 0)
3708 			return (error);
3709 		tm_p = &timeout;
3710 	}
3711 	return (do_sem2_wait(td, uap->obj, tm_p));
3712 }
3713 
3714 static int
3715 __umtx_op_nwake_private32(struct thread *td, struct _umtx_op_args *uap)
3716 {
3717 	int count = uap->val;
3718 	uint32_t uaddrs[BATCH_SIZE];
3719 	uint32_t **upp = (uint32_t **)uap->obj;
3720 	int tocopy;
3721 	int error = 0;
3722 	int i, pos = 0;
3723 
3724 	while (count > 0) {
3725 		tocopy = count;
3726 		if (tocopy > BATCH_SIZE)
3727 			tocopy = BATCH_SIZE;
3728 		error = copyin(upp+pos, uaddrs, tocopy * sizeof(uint32_t));
3729 		if (error != 0)
3730 			break;
3731 		for (i = 0; i < tocopy; ++i)
3732 			kern_umtx_wake(td, (void *)(intptr_t)uaddrs[i],
3733 				INT_MAX, 1);
3734 		count -= tocopy;
3735 		pos += tocopy;
3736 	}
3737 	return (error);
3738 }
3739 
3740 static const _umtx_op_func op_table_compat32[] = {
3741 	[UMTX_OP_RESERVED0]	= __umtx_op_unimpl,
3742 	[UMTX_OP_RESERVED1]	= __umtx_op_unimpl,
3743 	[UMTX_OP_WAIT]	= __umtx_op_wait_compat32,
3744 	[UMTX_OP_WAKE]	= __umtx_op_wake,
3745 	[UMTX_OP_MUTEX_LOCK]	= __umtx_op_trylock_umutex,
3746 	[UMTX_OP_MUTEX_TRYLOCK]	= __umtx_op_lock_umutex_compat32,
3747 	[UMTX_OP_MUTEX_UNLOCK]	= __umtx_op_unlock_umutex,
3748 	[UMTX_OP_SET_CEILING]	= __umtx_op_set_ceiling,
3749 	[UMTX_OP_CV_WAIT]	= __umtx_op_cv_wait_compat32,
3750 	[UMTX_OP_CV_SIGNAL]	= __umtx_op_cv_signal,
3751 	[UMTX_OP_CV_BROADCAST]	= __umtx_op_cv_broadcast,
3752 	[UMTX_OP_WAIT_UINT]	= __umtx_op_wait_compat32,
3753 	[UMTX_OP_RW_RDLOCK]	= __umtx_op_rw_rdlock_compat32,
3754 	[UMTX_OP_RW_WRLOCK]	= __umtx_op_rw_wrlock_compat32,
3755 	[UMTX_OP_RW_UNLOCK]	= __umtx_op_rw_unlock,
3756 	[UMTX_OP_WAIT_UINT_PRIVATE] = __umtx_op_wait_uint_private_compat32,
3757 	[UMTX_OP_WAKE_PRIVATE]	= __umtx_op_wake_private,
3758 	[UMTX_OP_MUTEX_WAIT]	= __umtx_op_wait_umutex_compat32,
3759 	[UMTX_OP_MUTEX_WAKE]	= __umtx_op_wake_umutex,
3760 #if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10)
3761 	[UMTX_OP_SEM_WAIT]	= __umtx_op_sem_wait_compat32,
3762 	[UMTX_OP_SEM_WAKE]	= __umtx_op_sem_wake,
3763 #else
3764 	[UMTX_OP_SEM_WAIT]	= __umtx_op_unimpl,
3765 	[UMTX_OP_SEM_WAKE]	= __umtx_op_unimpl,
3766 #endif
3767 	[UMTX_OP_NWAKE_PRIVATE]	= __umtx_op_nwake_private32,
3768 	[UMTX_OP_MUTEX_WAKE2]	= __umtx_op_wake2_umutex,
3769 	[UMTX_OP_SEM2_WAIT]	= __umtx_op_sem2_wait_compat32,
3770 	[UMTX_OP_SEM2_WAKE]	= __umtx_op_sem2_wake,
3771 };
3772 
3773 int
3774 freebsd32_umtx_op(struct thread *td, struct freebsd32_umtx_op_args *uap)
3775 {
3776 
3777 	if ((unsigned)uap->op < nitems(op_table_compat32)) {
3778 		return (*op_table_compat32[uap->op])(td,
3779 		    (struct _umtx_op_args *)uap);
3780 	}
3781 	return (EINVAL);
3782 }
3783 #endif
3784 
3785 void
3786 umtx_thread_init(struct thread *td)
3787 {
3788 	td->td_umtxq = umtxq_alloc();
3789 	td->td_umtxq->uq_thread = td;
3790 }
3791 
3792 void
3793 umtx_thread_fini(struct thread *td)
3794 {
3795 	umtxq_free(td->td_umtxq);
3796 }
3797 
3798 /*
3799  * It will be called when new thread is created, e.g fork().
3800  */
3801 void
3802 umtx_thread_alloc(struct thread *td)
3803 {
3804 	struct umtx_q *uq;
3805 
3806 	uq = td->td_umtxq;
3807 	uq->uq_inherited_pri = PRI_MAX;
3808 
3809 	KASSERT(uq->uq_flags == 0, ("uq_flags != 0"));
3810 	KASSERT(uq->uq_thread == td, ("uq_thread != td"));
3811 	KASSERT(uq->uq_pi_blocked == NULL, ("uq_pi_blocked != NULL"));
3812 	KASSERT(TAILQ_EMPTY(&uq->uq_pi_contested), ("uq_pi_contested is not empty"));
3813 }
3814 
3815 /*
3816  * exec() hook.
3817  */
3818 static void
3819 umtx_exec_hook(void *arg __unused, struct proc *p __unused,
3820 	struct image_params *imgp __unused)
3821 {
3822 	umtx_thread_cleanup(curthread);
3823 }
3824 
/*
 * thread_exit() hook: clean up the umtx state of the exiting thread
 * 'td'.  This is a thin wrapper around umtx_thread_cleanup().
 */
void
umtx_thread_exit(struct thread *td)
{

	umtx_thread_cleanup(td);
}
3833 
3834 /*
3835  * clean up umtx data.
3836  */
3837 static void
3838 umtx_thread_cleanup(struct thread *td)
3839 {
3840 	struct umtx_q *uq;
3841 	struct umtx_pi *pi;
3842 
3843 	if ((uq = td->td_umtxq) == NULL)
3844 		return;
3845 
3846 	mtx_lock(&umtx_lock);
3847 	uq->uq_inherited_pri = PRI_MAX;
3848 	while ((pi = TAILQ_FIRST(&uq->uq_pi_contested)) != NULL) {
3849 		pi->pi_owner = NULL;
3850 		TAILQ_REMOVE(&uq->uq_pi_contested, pi, pi_link);
3851 	}
3852 	mtx_unlock(&umtx_lock);
3853 	thread_lock(td);
3854 	sched_lend_user_prio(td, PRI_MAX);
3855 	thread_unlock(td);
3856 }
3857