xref: /freebsd/sys/kern/kern_umtx.c (revision 26a222dc0c048fc071b548eadad7b80405a1b126)
1 /*-
2  * Copyright (c) 2004, David Xu <davidxu@freebsd.org>
3  * Copyright (c) 2002, Jeffrey Roberson <jeff@freebsd.org>
4  * All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice unmodified, this list of conditions, and the following
11  *    disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
17  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
19  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
20  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
21  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
22  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
25  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 #include <sys/cdefs.h>
29 __FBSDID("$FreeBSD$");
30 
31 #include "opt_compat.h"
32 #include "opt_umtx_profiling.h"
33 
34 #include <sys/param.h>
35 #include <sys/kernel.h>
36 #include <sys/limits.h>
37 #include <sys/lock.h>
38 #include <sys/malloc.h>
39 #include <sys/mutex.h>
40 #include <sys/priv.h>
41 #include <sys/proc.h>
42 #include <sys/sbuf.h>
43 #include <sys/sched.h>
44 #include <sys/smp.h>
45 #include <sys/sysctl.h>
46 #include <sys/sysent.h>
47 #include <sys/systm.h>
48 #include <sys/sysproto.h>
49 #include <sys/syscallsubr.h>
50 #include <sys/eventhandler.h>
51 #include <sys/umtx.h>
52 
53 #include <vm/vm.h>
54 #include <vm/vm_param.h>
55 #include <vm/pmap.h>
56 #include <vm/vm_map.h>
57 #include <vm/vm_object.h>
58 
59 #include <machine/cpu.h>
60 
61 #ifdef COMPAT_FREEBSD32
62 #include <compat/freebsd32/freebsd32_proto.h>
63 #endif
64 
65 #define _UMUTEX_TRY		1
66 #define _UMUTEX_WAIT		2
67 
68 #ifdef UMTX_PROFILING
69 #define	UPROF_PERC_BIGGER(w, f, sw, sf)					\
70 	(((w) > (sw)) || ((w) == (sw) && (f) > (sf)))
71 #endif
72 
/* Priority inheritance mutex info. */
struct umtx_pi {
	/* Thread currently owning the userland PI mutex. */
	struct thread		*pi_owner;

	/* Reference count; the object is freed when it drops to zero. */
	int			pi_refcount;

	/* List entry to link umtx held by a thread (uq_pi_contested). */
	TAILQ_ENTRY(umtx_pi)	pi_link;

	/* List entry in the chain's hash list (uc_pi_list). */
	TAILQ_ENTRY(umtx_pi)	pi_hashlink;

	/* Threads blocked on this PI mutex, in priority order. */
	TAILQ_HEAD(,umtx_q)	pi_blocked;

	/* Identifies the userland lock object this PI state belongs to. */
	struct umtx_key		pi_key;
};
93 
/* A userland synchronous object user: per-thread umtx wait state. */
struct umtx_q {
	/* Linked list entry for the per-key wait queue. */
	TAILQ_ENTRY(umtx_q)	uq_link;

	/* Key of the userland object this thread is (or was) waiting on. */
	struct umtx_key		uq_key;

	/* Umtx flags. */
	int			uq_flags;
#define UQF_UMTXQ	0x0001

	/* Back pointer to the thread owning this structure. */
	struct thread		*uq_thread;

	/*
	 * Blocked on PI mutex. read can use chain lock
	 * or umtx_lock, write must have both chain lock and
	 * umtx_lock being hold.
	 */
	struct umtx_pi		*uq_pi_blocked;

	/* Entry on a PI mutex's pi_blocked list. */
	TAILQ_ENTRY(umtx_q)	uq_lockq;

	/* PI mutexes owned by this thread that other threads contend for. */
	TAILQ_HEAD(,umtx_pi)	uq_pi_contested;

	/* Priority inherited from a PP (priority-protect) mutex. */
	u_char			uq_inherited_pri;

	/* Pre-allocated queue, donated/reclaimed on insert/remove. */
	struct umtxq_queue	*uq_spare_queue;

	/* The wait queue this thread is currently on (NULL if none). */
	struct umtxq_queue	*uq_cur_queue;
};
131 
132 TAILQ_HEAD(umtxq_head, umtx_q);
133 
/* Per-key wait-queue: all threads sleeping on one umtx key. */
struct umtxq_queue {
	struct umtxq_head	head;	/* Waiting threads, FIFO. */
	struct umtx_key		key;	/* Key all waiters share. */
	LIST_ENTRY(umtxq_queue)	link;	/* Entry in chain's queue list. */
	int			length;	/* Number of waiters on head. */
};
141 
142 LIST_HEAD(umtxq_list, umtxq_queue);
143 
/* Userland lock object's wait-queue chain: one hash bucket of queues. */
struct umtxq_chain {
	/* Lock for this chain. */
	struct mtx		uc_lock;

	/* Lists of per-key sleep queues hashed to this chain. */
	struct umtxq_list	uc_queue[2];
#define UMTX_SHARED_QUEUE	0
#define UMTX_EXCLUSIVE_QUEUE	1

	/* Empty queue structures available for reuse. */
	LIST_HEAD(, umtxq_queue) uc_spare_queue;

	/* Busy flag: set while a potentially-blocking operation runs. */
	char			uc_busy;

	/* Number of threads sleeping for the busy flag to clear. */
	int			uc_waiters;

	/* All PI mutexes hashed to this chain. */
	TAILQ_HEAD(,umtx_pi)	uc_pi_list;

#ifdef UMTX_PROFILING
	u_int 			length;		/* Current queue count. */
	u_int			max_length;	/* High-water mark. */
#endif
};
170 
171 #define	UMTXQ_LOCKED_ASSERT(uc)		mtx_assert(&(uc)->uc_lock, MA_OWNED)
172 
173 /*
174  * Don't propagate time-sharing priority, there is a security reason,
175  * a user can simply introduce PI-mutex, let thread A lock the mutex,
176  * and let another thread B block on the mutex, because B is
177  * sleeping, its priority will be boosted, this causes A's priority to
178  * be boosted via priority propagating too and will never be lowered even
179  * if it is using 100%CPU, this is unfair to other processes.
180  */
181 
182 #define UPRI(td)	(((td)->td_user_pri >= PRI_MIN_TIMESHARE &&\
183 			  (td)->td_user_pri <= PRI_MAX_TIMESHARE) ?\
184 			 PRI_MAX_TIMESHARE : (td)->td_user_pri)
185 
186 #define	GOLDEN_RATIO_PRIME	2654404609U
187 #define	UMTX_CHAINS		512
188 #define	UMTX_SHIFTS		(__WORD_BIT - 9)
189 
190 #define	GET_SHARE(flags)	\
191     (((flags) & USYNC_PROCESS_SHARED) == 0 ? THREAD_SHARE : PROCESS_SHARE)
192 
193 #define BUSY_SPINS		200
194 
/*
 * Deadline for a timed sleep: the clock it is measured against, the
 * most recently sampled current time, and the absolute end time.
 */
struct abs_timeout {
	int clockid;		/* CLOCK_* id used for sampling. */
	struct timespec cur;	/* Cached "now", refreshed on wakeup. */
	struct timespec end;	/* Absolute expiry time. */
};
200 
201 static uma_zone_t		umtx_pi_zone;
202 static struct umtxq_chain	umtxq_chains[2][UMTX_CHAINS];
203 static MALLOC_DEFINE(M_UMTX, "umtx", "UMTX queue memory");
204 static int			umtx_pi_allocated;
205 
206 static SYSCTL_NODE(_debug, OID_AUTO, umtx, CTLFLAG_RW, 0, "umtx debug");
207 SYSCTL_INT(_debug_umtx, OID_AUTO, umtx_pi_allocated, CTLFLAG_RD,
208     &umtx_pi_allocated, 0, "Allocated umtx_pi");
209 
210 #ifdef UMTX_PROFILING
211 static long max_length;
212 SYSCTL_LONG(_debug_umtx, OID_AUTO, max_length, CTLFLAG_RD, &max_length, 0, "max_length");
213 static SYSCTL_NODE(_debug_umtx, OID_AUTO, chains, CTLFLAG_RD, 0, "umtx chain stats");
214 #endif
215 
216 static void umtxq_sysinit(void *);
217 static void umtxq_hash(struct umtx_key *key);
218 static struct umtxq_chain *umtxq_getchain(struct umtx_key *key);
219 static void umtxq_lock(struct umtx_key *key);
220 static void umtxq_unlock(struct umtx_key *key);
221 static void umtxq_busy(struct umtx_key *key);
222 static void umtxq_unbusy(struct umtx_key *key);
223 static void umtxq_insert_queue(struct umtx_q *uq, int q);
224 static void umtxq_remove_queue(struct umtx_q *uq, int q);
225 static int umtxq_sleep(struct umtx_q *uq, const char *wmesg, struct abs_timeout *);
226 static int umtxq_count(struct umtx_key *key);
227 static struct umtx_pi *umtx_pi_alloc(int);
228 static void umtx_pi_free(struct umtx_pi *pi);
229 static int do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags);
230 static void umtx_thread_cleanup(struct thread *td);
231 static void umtx_exec_hook(void *arg __unused, struct proc *p __unused,
232 	struct image_params *imgp __unused);
233 SYSINIT(umtx, SI_SUB_EVENTHANDLER+1, SI_ORDER_MIDDLE, umtxq_sysinit, NULL);
234 
235 #define umtxq_signal(key, nwake)	umtxq_signal_queue((key), (nwake), UMTX_SHARED_QUEUE)
236 #define umtxq_insert(uq)	umtxq_insert_queue((uq), UMTX_SHARED_QUEUE)
237 #define umtxq_remove(uq)	umtxq_remove_queue((uq), UMTX_SHARED_QUEUE)
238 
239 static struct mtx umtx_lock;
240 
241 #ifdef UMTX_PROFILING
/*
 * Register one sysctl node per hash chain under debug.umtx.chains so the
 * per-chain high-water marks can be inspected at runtime.
 */
static void
umtx_init_profiling(void)
{
	struct sysctl_oid *chain_oid;
	char chain_name[10];	/* Fits "511" plus NUL with room to spare. */
	int i;

	for (i = 0; i < UMTX_CHAINS; ++i) {
		snprintf(chain_name, sizeof(chain_name), "%d", i);
		chain_oid = SYSCTL_ADD_NODE(NULL,
		    SYSCTL_STATIC_CHILDREN(_debug_umtx_chains), OID_AUTO,
		    chain_name, CTLFLAG_RD, NULL, "umtx hash stats");
		/* Row 0 and row 1 of umtxq_chains get separate counters. */
		SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO,
		    "max_length0", CTLFLAG_RD, &umtxq_chains[0][i].max_length, 0, NULL);
		SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO,
		    "max_length1", CTLFLAG_RD, &umtxq_chains[1][i].max_length, 0, NULL);
	}
}
260 
/*
 * Sysctl handler: report, for each of the two chain rows, the five hash
 * buckets with the highest recorded max_length, each expressed as a
 * percentage of the row's total.
 */
static int
sysctl_debug_umtx_chains_peaks(SYSCTL_HANDLER_ARGS)
{
	char buf[512];
	struct sbuf sb;
	struct umtxq_chain *uc;
	u_int fract, i, j, tot, whole;
	/* sw*/sf* hold the top-5 whole/fractional percentages, si* the
	 * bucket indices; slot 0 is the largest. */
	u_int sf0, sf1, sf2, sf3, sf4;
	u_int si0, si1, si2, si3, si4;
	u_int sw0, sw1, sw2, sw3, sw4;

	sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN);
	for (i = 0; i < 2; i++) {
		/* First pass: sum the peaks so percentages can be formed. */
		tot = 0;
		for (j = 0; j < UMTX_CHAINS; ++j) {
			uc = &umtxq_chains[i][j];
			mtx_lock(&uc->uc_lock);
			tot += uc->max_length;
			mtx_unlock(&uc->uc_lock);
		}
		if (tot == 0)
			sbuf_printf(&sb, "%u) Empty ", i);
		else {
			sf0 = sf1 = sf2 = sf3 = sf4 = 0;
			si0 = si1 = si2 = si3 = si4 = 0;
			sw0 = sw1 = sw2 = sw3 = sw4 = 0;
			/*
			 * Second pass: insertion-select the five largest
			 * buckets.  NOTE(review): a bucket is only compared
			 * against the first slot it fails to beat, so the
			 * slots below the first are an approximation, not an
			 * exact top-5 — this mirrors the original behavior.
			 */
			for (j = 0; j < UMTX_CHAINS; j++) {
				uc = &umtxq_chains[i][j];
				mtx_lock(&uc->uc_lock);
				whole = uc->max_length * 100;
				mtx_unlock(&uc->uc_lock);
				fract = (whole % tot) * 100;
				if (UPROF_PERC_BIGGER(whole, fract, sw0, sf0)) {
					sf0 = fract;
					si0 = j;
					sw0 = whole;
				} else if (UPROF_PERC_BIGGER(whole, fract, sw1,
				    sf1)) {
					sf1 = fract;
					si1 = j;
					sw1 = whole;
				} else if (UPROF_PERC_BIGGER(whole, fract, sw2,
				    sf2)) {
					sf2 = fract;
					si2 = j;
					sw2 = whole;
				} else if (UPROF_PERC_BIGGER(whole, fract, sw3,
				    sf3)) {
					sf3 = fract;
					si3 = j;
					sw3 = whole;
				} else if (UPROF_PERC_BIGGER(whole, fract, sw4,
				    sf4)) {
					sf4 = fract;
					si4 = j;
					sw4 = whole;
				}
			}
			sbuf_printf(&sb, "queue %u:\n", i);
			sbuf_printf(&sb, "1st: %u.%u%% idx: %u\n", sw0 / tot,
			    sf0 / tot, si0);
			sbuf_printf(&sb, "2nd: %u.%u%% idx: %u\n", sw1 / tot,
			    sf1 / tot, si1);
			sbuf_printf(&sb, "3rd: %u.%u%% idx: %u\n", sw2 / tot,
			    sf2 / tot, si2);
			sbuf_printf(&sb, "4th: %u.%u%% idx: %u\n", sw3 / tot,
			    sf3 / tot, si3);
			sbuf_printf(&sb, "5th: %u.%u%% idx: %u\n", sw4 / tot,
			    sf4 / tot, si4);
		}
	}
	sbuf_trim(&sb);
	sbuf_finish(&sb);
	sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req);
	sbuf_delete(&sb);
	return (0);
}
338 
339 static int
340 sysctl_debug_umtx_chains_clear(SYSCTL_HANDLER_ARGS)
341 {
342 	struct umtxq_chain *uc;
343 	u_int i, j;
344 	int clear, error;
345 
346 	clear = 0;
347 	error = sysctl_handle_int(oidp, &clear, 0, req);
348 	if (error != 0 || req->newptr == NULL)
349 		return (error);
350 
351 	if (clear != 0) {
352 		for (i = 0; i < 2; ++i) {
353 			for (j = 0; j < UMTX_CHAINS; ++j) {
354 				uc = &umtxq_chains[i][j];
355 				mtx_lock(&uc->uc_lock);
356 				uc->length = 0;
357 				uc->max_length = 0;
358 				mtx_unlock(&uc->uc_lock);
359 			}
360 		}
361 	}
362 	return (0);
363 }
364 
365 SYSCTL_PROC(_debug_umtx_chains, OID_AUTO, clear,
366     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0,
367     sysctl_debug_umtx_chains_clear, "I", "Clear umtx chains statistics");
368 SYSCTL_PROC(_debug_umtx_chains, OID_AUTO, peaks,
369     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0,
370     sysctl_debug_umtx_chains_peaks, "A", "Highest peaks in chains max length");
371 #endif
372 
373 static void
374 umtxq_sysinit(void *arg __unused)
375 {
376 	int i, j;
377 
378 	umtx_pi_zone = uma_zcreate("umtx pi", sizeof(struct umtx_pi),
379 		NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
380 	for (i = 0; i < 2; ++i) {
381 		for (j = 0; j < UMTX_CHAINS; ++j) {
382 			mtx_init(&umtxq_chains[i][j].uc_lock, "umtxql", NULL,
383 				 MTX_DEF | MTX_DUPOK);
384 			LIST_INIT(&umtxq_chains[i][j].uc_queue[0]);
385 			LIST_INIT(&umtxq_chains[i][j].uc_queue[1]);
386 			LIST_INIT(&umtxq_chains[i][j].uc_spare_queue);
387 			TAILQ_INIT(&umtxq_chains[i][j].uc_pi_list);
388 			umtxq_chains[i][j].uc_busy = 0;
389 			umtxq_chains[i][j].uc_waiters = 0;
390 #ifdef UMTX_PROFILING
391 			umtxq_chains[i][j].length = 0;
392 			umtxq_chains[i][j].max_length = 0;
393 #endif
394 		}
395 	}
396 #ifdef UMTX_PROFILING
397 	umtx_init_profiling();
398 #endif
399 	mtx_init(&umtx_lock, "umtx lock", NULL, MTX_DEF);
400 	EVENTHANDLER_REGISTER(process_exec, umtx_exec_hook, NULL,
401 	    EVENTHANDLER_PRI_ANY);
402 }
403 
404 struct umtx_q *
405 umtxq_alloc(void)
406 {
407 	struct umtx_q *uq;
408 
409 	uq = malloc(sizeof(struct umtx_q), M_UMTX, M_WAITOK | M_ZERO);
410 	uq->uq_spare_queue = malloc(sizeof(struct umtxq_queue), M_UMTX, M_WAITOK | M_ZERO);
411 	TAILQ_INIT(&uq->uq_spare_queue->head);
412 	TAILQ_INIT(&uq->uq_pi_contested);
413 	uq->uq_inherited_pri = PRI_MAX;
414 	return (uq);
415 }
416 
417 void
418 umtxq_free(struct umtx_q *uq)
419 {
420 	MPASS(uq->uq_spare_queue != NULL);
421 	free(uq->uq_spare_queue, M_UMTX);
422 	free(uq, M_UMTX);
423 }
424 
425 static inline void
426 umtxq_hash(struct umtx_key *key)
427 {
428 	unsigned n = (uintptr_t)key->info.both.a + key->info.both.b;
429 	key->hash = ((n * GOLDEN_RATIO_PRIME) >> UMTX_SHIFTS) % UMTX_CHAINS;
430 }
431 
432 static inline struct umtxq_chain *
433 umtxq_getchain(struct umtx_key *key)
434 {
435 	if (key->type <= TYPE_SEM)
436 		return (&umtxq_chains[1][key->hash]);
437 	return (&umtxq_chains[0][key->hash]);
438 }
439 
440 /*
441  * Lock a chain.
442  */
443 static inline void
444 umtxq_lock(struct umtx_key *key)
445 {
446 	struct umtxq_chain *uc;
447 
448 	uc = umtxq_getchain(key);
449 	mtx_lock(&uc->uc_lock);
450 }
451 
452 /*
453  * Unlock a chain.
454  */
455 static inline void
456 umtxq_unlock(struct umtx_key *key)
457 {
458 	struct umtxq_chain *uc;
459 
460 	uc = umtxq_getchain(key);
461 	mtx_unlock(&uc->uc_lock);
462 }
463 
464 /*
465  * Set chain to busy state when following operation
466  * may be blocked (kernel mutex can not be used).
467  */
468 static inline void
469 umtxq_busy(struct umtx_key *key)
470 {
471 	struct umtxq_chain *uc;
472 
473 	uc = umtxq_getchain(key);
474 	mtx_assert(&uc->uc_lock, MA_OWNED);
475 	if (uc->uc_busy) {
476 #ifdef SMP
477 		if (smp_cpus > 1) {
478 			int count = BUSY_SPINS;
479 			if (count > 0) {
480 				umtxq_unlock(key);
481 				while (uc->uc_busy && --count > 0)
482 					cpu_spinwait();
483 				umtxq_lock(key);
484 			}
485 		}
486 #endif
487 		while (uc->uc_busy) {
488 			uc->uc_waiters++;
489 			msleep(uc, &uc->uc_lock, 0, "umtxqb", 0);
490 			uc->uc_waiters--;
491 		}
492 	}
493 	uc->uc_busy = 1;
494 }
495 
496 /*
497  * Unbusy a chain.
498  */
499 static inline void
500 umtxq_unbusy(struct umtx_key *key)
501 {
502 	struct umtxq_chain *uc;
503 
504 	uc = umtxq_getchain(key);
505 	mtx_assert(&uc->uc_lock, MA_OWNED);
506 	KASSERT(uc->uc_busy != 0, ("not busy"));
507 	uc->uc_busy = 0;
508 	if (uc->uc_waiters)
509 		wakeup_one(uc);
510 }
511 
/*
 * Clear the busy flag when the chain lock is not already held: take it
 * just long enough to unbusy and wake waiters.
 */
static inline void
umtxq_unbusy_unlocked(struct umtx_key *key)
{

	umtxq_lock(key);
	umtxq_unbusy(key);
	umtxq_unlock(key);
}
520 
521 static struct umtxq_queue *
522 umtxq_queue_lookup(struct umtx_key *key, int q)
523 {
524 	struct umtxq_queue *uh;
525 	struct umtxq_chain *uc;
526 
527 	uc = umtxq_getchain(key);
528 	UMTXQ_LOCKED_ASSERT(uc);
529 	LIST_FOREACH(uh, &uc->uc_queue[q], link) {
530 		if (umtx_key_match(&uh->key, key))
531 			return (uh);
532 	}
533 
534 	return (NULL);
535 }
536 
/*
 * Append uq to the wait queue for its key on list q, creating the
 * per-key queue from uq's spare queue if none exists yet.  Called with
 * the chain lock held.
 */
static inline void
umtxq_insert_queue(struct umtx_q *uq, int q)
{
	struct umtxq_queue *uh;
	struct umtxq_chain *uc;

	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	KASSERT((uq->uq_flags & UQF_UMTXQ) == 0, ("umtx_q is already on queue"));
	uh = umtxq_queue_lookup(&uq->uq_key, q);
	if (uh != NULL) {
		/*
		 * A queue for this key already exists: donate our spare
		 * queue to the chain's spare list for later reuse.
		 */
		LIST_INSERT_HEAD(&uc->uc_spare_queue, uq->uq_spare_queue, link);
	} else {
		/* No queue for this key yet: our spare becomes it. */
		uh = uq->uq_spare_queue;
		uh->key = uq->uq_key;
		LIST_INSERT_HEAD(&uc->uc_queue[q], uh, link);
#ifdef UMTX_PROFILING
		uc->length++;
		if (uc->length > uc->max_length) {
			uc->max_length = uc->length;
			if (uc->max_length > max_length)
				max_length = uc->max_length;
		}
#endif
	}
	/* Either way the spare queue has been consumed. */
	uq->uq_spare_queue = NULL;

	TAILQ_INSERT_TAIL(&uh->head, uq, uq_link);
	uh->length++;
	uq->uq_flags |= UQF_UMTXQ;
	uq->uq_cur_queue = uh;
	return;
}
570 
/*
 * Remove uq from the wait queue it is on (list q) and replenish its
 * spare queue, either with the now-empty per-key queue or with one
 * taken from the chain's spare list.  Called with the chain lock held;
 * a no-op if uq is not queued.
 */
static inline void
umtxq_remove_queue(struct umtx_q *uq, int q)
{
	struct umtxq_chain *uc;
	struct umtxq_queue *uh;

	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	if (uq->uq_flags & UQF_UMTXQ) {
		uh = uq->uq_cur_queue;
		TAILQ_REMOVE(&uh->head, uq, uq_link);
		uh->length--;
		uq->uq_flags &= ~UQF_UMTXQ;
		if (TAILQ_EMPTY(&uh->head)) {
			KASSERT(uh->length == 0,
			    ("inconsistent umtxq_queue length"));
#ifdef UMTX_PROFILING
			uc->length--;
#endif
			/* Queue now empty: it becomes our new spare. */
			LIST_REMOVE(uh, link);
		} else {
			/*
			 * Queue still has waiters: take a spare from the
			 * chain instead (one must exist, each waiter
			 * donated one on insert).
			 */
			uh = LIST_FIRST(&uc->uc_spare_queue);
			KASSERT(uh != NULL, ("uc_spare_queue is empty"));
			LIST_REMOVE(uh, link);
		}
		uq->uq_spare_queue = uh;
		uq->uq_cur_queue = NULL;
	}
}
600 
601 /*
602  * Check if there are multiple waiters
603  */
604 static int
605 umtxq_count(struct umtx_key *key)
606 {
607 	struct umtxq_chain *uc;
608 	struct umtxq_queue *uh;
609 
610 	uc = umtxq_getchain(key);
611 	UMTXQ_LOCKED_ASSERT(uc);
612 	uh = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE);
613 	if (uh != NULL)
614 		return (uh->length);
615 	return (0);
616 }
617 
618 /*
619  * Check if there are multiple PI waiters and returns first
620  * waiter.
621  */
622 static int
623 umtxq_count_pi(struct umtx_key *key, struct umtx_q **first)
624 {
625 	struct umtxq_chain *uc;
626 	struct umtxq_queue *uh;
627 
628 	*first = NULL;
629 	uc = umtxq_getchain(key);
630 	UMTXQ_LOCKED_ASSERT(uc);
631 	uh = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE);
632 	if (uh != NULL) {
633 		*first = TAILQ_FIRST(&uh->head);
634 		return (uh->length);
635 	}
636 	return (0);
637 }
638 
639 static int
640 umtxq_check_susp(struct thread *td)
641 {
642 	struct proc *p;
643 	int error;
644 
645 	/*
646 	 * The check for TDF_NEEDSUSPCHK is racy, but it is enough to
647 	 * eventually break the lockstep loop.
648 	 */
649 	if ((td->td_flags & TDF_NEEDSUSPCHK) == 0)
650 		return (0);
651 	error = 0;
652 	p = td->td_proc;
653 	PROC_LOCK(p);
654 	if (P_SHOULDSTOP(p) ||
655 	    ((p->p_flag & P_TRACED) && (td->td_dbgflags & TDB_SUSPEND))) {
656 		if (p->p_flag & P_SINGLE_EXIT)
657 			error = EINTR;
658 		else
659 			error = ERESTART;
660 	}
661 	PROC_UNLOCK(p);
662 	return (error);
663 }
664 
665 /*
666  * Wake up threads waiting on an userland object.
667  */
668 
669 static int
670 umtxq_signal_queue(struct umtx_key *key, int n_wake, int q)
671 {
672 	struct umtxq_chain *uc;
673 	struct umtxq_queue *uh;
674 	struct umtx_q *uq;
675 	int ret;
676 
677 	ret = 0;
678 	uc = umtxq_getchain(key);
679 	UMTXQ_LOCKED_ASSERT(uc);
680 	uh = umtxq_queue_lookup(key, q);
681 	if (uh != NULL) {
682 		while ((uq = TAILQ_FIRST(&uh->head)) != NULL) {
683 			umtxq_remove_queue(uq, q);
684 			wakeup(uq);
685 			if (++ret >= n_wake)
686 				return (ret);
687 		}
688 	}
689 	return (ret);
690 }
691 
692 
693 /*
694  * Wake up specified thread.
695  */
696 static inline void
697 umtxq_signal_thread(struct umtx_q *uq)
698 {
699 	struct umtxq_chain *uc;
700 
701 	uc = umtxq_getchain(&uq->uq_key);
702 	UMTXQ_LOCKED_ASSERT(uc);
703 	umtxq_remove(uq);
704 	wakeup(uq);
705 }
706 
707 static inline int
708 tstohz(const struct timespec *tsp)
709 {
710 	struct timeval tv;
711 
712 	TIMESPEC_TO_TIMEVAL(&tv, tsp);
713 	return tvtohz(&tv);
714 }
715 
716 static void
717 abs_timeout_init(struct abs_timeout *timo, int clockid, int absolute,
718 	const struct timespec *timeout)
719 {
720 
721 	timo->clockid = clockid;
722 	if (!absolute) {
723 		kern_clock_gettime(curthread, clockid, &timo->end);
724 		timo->cur = timo->end;
725 		timespecadd(&timo->end, timeout);
726 	} else {
727 		timo->end = *timeout;
728 		kern_clock_gettime(curthread, clockid, &timo->cur);
729 	}
730 }
731 
/*
 * Initialize a deadline from a userland _umtx_time request, honoring
 * its clock id and the UMTX_ABSTIME flag.
 */
static void
abs_timeout_init2(struct abs_timeout *timo, const struct _umtx_time *umtxtime)
{

	abs_timeout_init(timo, umtxtime->_clockid,
		(umtxtime->_flags & UMTX_ABSTIME) != 0,
		&umtxtime->_timeout);
}
740 
/* Refresh the cached current time on the deadline's clock. */
static inline void
abs_timeout_update(struct abs_timeout *timo)
{
	kern_clock_gettime(curthread, timo->clockid, &timo->cur);
}
746 
747 static int
748 abs_timeout_gethz(struct abs_timeout *timo)
749 {
750 	struct timespec tts;
751 
752 	if (timespeccmp(&timo->end, &timo->cur, <=))
753 		return (-1);
754 	tts = timo->end;
755 	timespecsub(&tts, &timo->cur);
756 	return (tstohz(&tts));
757 }
758 
759 /*
760  * Put thread into sleep state, before sleeping, check if
761  * thread was removed from umtx queue.
762  */
763 static inline int
764 umtxq_sleep(struct umtx_q *uq, const char *wmesg, struct abs_timeout *abstime)
765 {
766 	struct umtxq_chain *uc;
767 	int error, timo;
768 
769 	uc = umtxq_getchain(&uq->uq_key);
770 	UMTXQ_LOCKED_ASSERT(uc);
771 	for (;;) {
772 		if (!(uq->uq_flags & UQF_UMTXQ))
773 			return (0);
774 		if (abstime != NULL) {
775 			timo = abs_timeout_gethz(abstime);
776 			if (timo < 0)
777 				return (ETIMEDOUT);
778 		} else
779 			timo = 0;
780 		error = msleep(uq, &uc->uc_lock, PCATCH | PDROP, wmesg, timo);
781 		if (error != EWOULDBLOCK) {
782 			umtxq_lock(&uq->uq_key);
783 			break;
784 		}
785 		if (abstime != NULL)
786 			abs_timeout_update(abstime);
787 		umtxq_lock(&uq->uq_key);
788 	}
789 	return (error);
790 }
791 
792 /*
793  * Convert userspace address into unique logical address.
794  */
795 int
796 umtx_key_get(void *addr, int type, int share, struct umtx_key *key)
797 {
798 	struct thread *td = curthread;
799 	vm_map_t map;
800 	vm_map_entry_t entry;
801 	vm_pindex_t pindex;
802 	vm_prot_t prot;
803 	boolean_t wired;
804 
805 	key->type = type;
806 	if (share == THREAD_SHARE) {
807 		key->shared = 0;
808 		key->info.private.vs = td->td_proc->p_vmspace;
809 		key->info.private.addr = (uintptr_t)addr;
810 	} else {
811 		MPASS(share == PROCESS_SHARE || share == AUTO_SHARE);
812 		map = &td->td_proc->p_vmspace->vm_map;
813 		if (vm_map_lookup(&map, (vm_offset_t)addr, VM_PROT_WRITE,
814 		    &entry, &key->info.shared.object, &pindex, &prot,
815 		    &wired) != KERN_SUCCESS) {
816 			return EFAULT;
817 		}
818 
819 		if ((share == PROCESS_SHARE) ||
820 		    (share == AUTO_SHARE &&
821 		     VM_INHERIT_SHARE == entry->inheritance)) {
822 			key->shared = 1;
823 			key->info.shared.offset = entry->offset + entry->start -
824 				(vm_offset_t)addr;
825 			vm_object_reference(key->info.shared.object);
826 		} else {
827 			key->shared = 0;
828 			key->info.private.vs = td->td_proc->p_vmspace;
829 			key->info.private.addr = (uintptr_t)addr;
830 		}
831 		vm_map_lookup_done(map, entry);
832 	}
833 
834 	umtxq_hash(key);
835 	return (0);
836 }
837 
838 /*
839  * Release key.
840  */
841 void
842 umtx_key_release(struct umtx_key *key)
843 {
844 	if (key->shared)
845 		vm_object_deallocate(key->info.shared.object);
846 }
847 
848 /*
849  * Fetch and compare value, sleep on the address if value is not changed.
850  */
851 static int
852 do_wait(struct thread *td, void *addr, u_long id,
853 	struct _umtx_time *timeout, int compat32, int is_private)
854 {
855 	struct abs_timeout timo;
856 	struct umtx_q *uq;
857 	u_long tmp;
858 	uint32_t tmp32;
859 	int error = 0;
860 
861 	uq = td->td_umtxq;
862 	if ((error = umtx_key_get(addr, TYPE_SIMPLE_WAIT,
863 		is_private ? THREAD_SHARE : AUTO_SHARE, &uq->uq_key)) != 0)
864 		return (error);
865 
866 	if (timeout != NULL)
867 		abs_timeout_init2(&timo, timeout);
868 
869 	umtxq_lock(&uq->uq_key);
870 	umtxq_insert(uq);
871 	umtxq_unlock(&uq->uq_key);
872 	if (compat32 == 0) {
873 		error = fueword(addr, &tmp);
874 		if (error != 0)
875 			error = EFAULT;
876 	} else {
877 		error = fueword32(addr, &tmp32);
878 		if (error == 0)
879 			tmp = tmp32;
880 		else
881 			error = EFAULT;
882 	}
883 	umtxq_lock(&uq->uq_key);
884 	if (error == 0) {
885 		if (tmp == id)
886 			error = umtxq_sleep(uq, "uwait", timeout == NULL ?
887 			    NULL : &timo);
888 		if ((uq->uq_flags & UQF_UMTXQ) == 0)
889 			error = 0;
890 		else
891 			umtxq_remove(uq);
892 	} else if ((uq->uq_flags & UQF_UMTXQ) != 0) {
893 		umtxq_remove(uq);
894 	}
895 	umtxq_unlock(&uq->uq_key);
896 	umtx_key_release(&uq->uq_key);
897 	if (error == ERESTART)
898 		error = EINTR;
899 	return (error);
900 }
901 
902 /*
903  * Wake up threads sleeping on the specified address.
904  */
905 int
906 kern_umtx_wake(struct thread *td, void *uaddr, int n_wake, int is_private)
907 {
908 	struct umtx_key key;
909 	int ret;
910 
911 	if ((ret = umtx_key_get(uaddr, TYPE_SIMPLE_WAIT,
912 		is_private ? THREAD_SHARE : AUTO_SHARE, &key)) != 0)
913 		return (ret);
914 	umtxq_lock(&key);
915 	ret = umtxq_signal(&key, n_wake);
916 	umtxq_unlock(&key);
917 	umtx_key_release(&key);
918 	return (0);
919 }
920 
921 /*
922  * Lock PTHREAD_PRIO_NONE protocol POSIX mutex.
923  */
924 static int
925 do_lock_normal(struct thread *td, struct umutex *m, uint32_t flags,
926 	struct _umtx_time *timeout, int mode)
927 {
928 	struct abs_timeout timo;
929 	struct umtx_q *uq;
930 	uint32_t owner, old, id;
931 	int error, rv;
932 
933 	id = td->td_tid;
934 	uq = td->td_umtxq;
935 	error = 0;
936 	if (timeout != NULL)
937 		abs_timeout_init2(&timo, timeout);
938 
939 	/*
940 	 * Care must be exercised when dealing with umtx structure. It
941 	 * can fault on any access.
942 	 */
943 	for (;;) {
944 		rv = fueword32(&m->m_owner, &owner);
945 		if (rv == -1)
946 			return (EFAULT);
947 		if (mode == _UMUTEX_WAIT) {
948 			if (owner == UMUTEX_UNOWNED || owner == UMUTEX_CONTESTED)
949 				return (0);
950 		} else {
951 			/*
952 			 * Try the uncontested case.  This should be done in userland.
953 			 */
954 			rv = casueword32(&m->m_owner, UMUTEX_UNOWNED,
955 			    &owner, id);
956 			/* The address was invalid. */
957 			if (rv == -1)
958 				return (EFAULT);
959 
960 			/* The acquire succeeded. */
961 			if (owner == UMUTEX_UNOWNED)
962 				return (0);
963 
964 			/* If no one owns it but it is contested try to acquire it. */
965 			if (owner == UMUTEX_CONTESTED) {
966 				rv = casueword32(&m->m_owner,
967 				    UMUTEX_CONTESTED, &owner,
968 				    id | UMUTEX_CONTESTED);
969 				/* The address was invalid. */
970 				if (rv == -1)
971 					return (EFAULT);
972 
973 				if (owner == UMUTEX_CONTESTED)
974 					return (0);
975 
976 				rv = umtxq_check_susp(td);
977 				if (rv != 0)
978 					return (rv);
979 
980 				/* If this failed the lock has changed, restart. */
981 				continue;
982 			}
983 		}
984 
985 		if (mode == _UMUTEX_TRY)
986 			return (EBUSY);
987 
988 		/*
989 		 * If we caught a signal, we have retried and now
990 		 * exit immediately.
991 		 */
992 		if (error != 0)
993 			return (error);
994 
995 		if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX,
996 		    GET_SHARE(flags), &uq->uq_key)) != 0)
997 			return (error);
998 
999 		umtxq_lock(&uq->uq_key);
1000 		umtxq_busy(&uq->uq_key);
1001 		umtxq_insert(uq);
1002 		umtxq_unlock(&uq->uq_key);
1003 
1004 		/*
1005 		 * Set the contested bit so that a release in user space
1006 		 * knows to use the system call for unlock.  If this fails
1007 		 * either some one else has acquired the lock or it has been
1008 		 * released.
1009 		 */
1010 		rv = casueword32(&m->m_owner, owner, &old,
1011 		    owner | UMUTEX_CONTESTED);
1012 
1013 		/* The address was invalid. */
1014 		if (rv == -1) {
1015 			umtxq_lock(&uq->uq_key);
1016 			umtxq_remove(uq);
1017 			umtxq_unbusy(&uq->uq_key);
1018 			umtxq_unlock(&uq->uq_key);
1019 			umtx_key_release(&uq->uq_key);
1020 			return (EFAULT);
1021 		}
1022 
1023 		/*
1024 		 * We set the contested bit, sleep. Otherwise the lock changed
1025 		 * and we need to retry or we lost a race to the thread
1026 		 * unlocking the umtx.
1027 		 */
1028 		umtxq_lock(&uq->uq_key);
1029 		umtxq_unbusy(&uq->uq_key);
1030 		if (old == owner)
1031 			error = umtxq_sleep(uq, "umtxn", timeout == NULL ?
1032 			    NULL : &timo);
1033 		umtxq_remove(uq);
1034 		umtxq_unlock(&uq->uq_key);
1035 		umtx_key_release(&uq->uq_key);
1036 
1037 		if (error == 0)
1038 			error = umtxq_check_susp(td);
1039 	}
1040 
1041 	return (0);
1042 }
1043 
1044 /*
1045  * Unlock PTHREAD_PRIO_NONE protocol POSIX mutex.
1046  */
1047 static int
1048 do_unlock_normal(struct thread *td, struct umutex *m, uint32_t flags)
1049 {
1050 	struct umtx_key key;
1051 	uint32_t owner, old, id;
1052 	int error;
1053 	int count;
1054 
1055 	id = td->td_tid;
1056 	/*
1057 	 * Make sure we own this mtx.
1058 	 */
1059 	error = fueword32(&m->m_owner, &owner);
1060 	if (error == -1)
1061 		return (EFAULT);
1062 
1063 	if ((owner & ~UMUTEX_CONTESTED) != id)
1064 		return (EPERM);
1065 
1066 	if ((owner & UMUTEX_CONTESTED) == 0) {
1067 		error = casueword32(&m->m_owner, owner, &old, UMUTEX_UNOWNED);
1068 		if (error == -1)
1069 			return (EFAULT);
1070 		if (old == owner)
1071 			return (0);
1072 		owner = old;
1073 	}
1074 
1075 	/* We should only ever be in here for contested locks */
1076 	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
1077 	    &key)) != 0)
1078 		return (error);
1079 
1080 	umtxq_lock(&key);
1081 	umtxq_busy(&key);
1082 	count = umtxq_count(&key);
1083 	umtxq_unlock(&key);
1084 
1085 	/*
1086 	 * When unlocking the umtx, it must be marked as unowned if
1087 	 * there is zero or one thread only waiting for it.
1088 	 * Otherwise, it must be marked as contested.
1089 	 */
1090 	error = casueword32(&m->m_owner, owner, &old,
1091 	    count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
1092 	umtxq_lock(&key);
1093 	umtxq_signal(&key,1);
1094 	umtxq_unbusy(&key);
1095 	umtxq_unlock(&key);
1096 	umtx_key_release(&key);
1097 	if (error == -1)
1098 		return (EFAULT);
1099 	if (old != owner)
1100 		return (EINVAL);
1101 	return (0);
1102 }
1103 
1104 /*
1105  * Check if the mutex is available and wake up a waiter,
1106  * only for simple mutex.
1107  */
1108 static int
1109 do_wake_umutex(struct thread *td, struct umutex *m)
1110 {
1111 	struct umtx_key key;
1112 	uint32_t owner;
1113 	uint32_t flags;
1114 	int error;
1115 	int count;
1116 
1117 	error = fueword32(&m->m_owner, &owner);
1118 	if (error == -1)
1119 		return (EFAULT);
1120 
1121 	if ((owner & ~UMUTEX_CONTESTED) != 0)
1122 		return (0);
1123 
1124 	error = fueword32(&m->m_flags, &flags);
1125 	if (error == -1)
1126 		return (EFAULT);
1127 
1128 	/* We should only ever be in here for contested locks */
1129 	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
1130 	    &key)) != 0)
1131 		return (error);
1132 
1133 	umtxq_lock(&key);
1134 	umtxq_busy(&key);
1135 	count = umtxq_count(&key);
1136 	umtxq_unlock(&key);
1137 
1138 	if (count <= 1) {
1139 		error = casueword32(&m->m_owner, UMUTEX_CONTESTED, &owner,
1140 		    UMUTEX_UNOWNED);
1141 		if (error == -1)
1142 			error = EFAULT;
1143 	}
1144 
1145 	umtxq_lock(&key);
1146 	if (error == 0 && count != 0 && (owner & ~UMUTEX_CONTESTED) == 0)
1147 		umtxq_signal(&key, 1);
1148 	umtxq_unbusy(&key);
1149 	umtxq_unlock(&key);
1150 	umtx_key_release(&key);
1151 	return (error);
1152 }
1153 
1154 /*
1155  * Check if the mutex has waiters and tries to fix contention bit.
1156  */
1157 static int
1158 do_wake2_umutex(struct thread *td, struct umutex *m, uint32_t flags)
1159 {
1160 	struct umtx_key key;
1161 	uint32_t owner, old;
1162 	int type;
1163 	int error;
1164 	int count;
1165 
1166 	switch(flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
1167 	case 0:
1168 		type = TYPE_NORMAL_UMUTEX;
1169 		break;
1170 	case UMUTEX_PRIO_INHERIT:
1171 		type = TYPE_PI_UMUTEX;
1172 		break;
1173 	case UMUTEX_PRIO_PROTECT:
1174 		type = TYPE_PP_UMUTEX;
1175 		break;
1176 	default:
1177 		return (EINVAL);
1178 	}
1179 	if ((error = umtx_key_get(m, type, GET_SHARE(flags),
1180 	    &key)) != 0)
1181 		return (error);
1182 
1183 	owner = 0;
1184 	umtxq_lock(&key);
1185 	umtxq_busy(&key);
1186 	count = umtxq_count(&key);
1187 	umtxq_unlock(&key);
1188 	/*
1189 	 * Only repair contention bit if there is a waiter, this means the mutex
1190 	 * is still being referenced by userland code, otherwise don't update
1191 	 * any memory.
1192 	 */
1193 	if (count > 1) {
1194 		error = fueword32(&m->m_owner, &owner);
1195 		if (error == -1)
1196 			error = EFAULT;
1197 		while (error == 0 && (owner & UMUTEX_CONTESTED) == 0) {
1198 			error = casueword32(&m->m_owner, owner, &old,
1199 			    owner | UMUTEX_CONTESTED);
1200 			if (error == -1) {
1201 				error = EFAULT;
1202 				break;
1203 			}
1204 			if (old == owner)
1205 				break;
1206 			owner = old;
1207 			error = umtxq_check_susp(td);
1208 			if (error != 0)
1209 				break;
1210 		}
1211 	} else if (count == 1) {
1212 		error = fueword32(&m->m_owner, &owner);
1213 		if (error == -1)
1214 			error = EFAULT;
1215 		while (error == 0 && (owner & ~UMUTEX_CONTESTED) != 0 &&
1216 		       (owner & UMUTEX_CONTESTED) == 0) {
1217 			error = casueword32(&m->m_owner, owner, &old,
1218 			    owner | UMUTEX_CONTESTED);
1219 			if (error == -1) {
1220 				error = EFAULT;
1221 				break;
1222 			}
1223 			if (old == owner)
1224 				break;
1225 			owner = old;
1226 			error = umtxq_check_susp(td);
1227 			if (error != 0)
1228 				break;
1229 		}
1230 	}
1231 	umtxq_lock(&key);
1232 	if (error == EFAULT) {
1233 		umtxq_signal(&key, INT_MAX);
1234 	} else if (count != 0 && (owner & ~UMUTEX_CONTESTED) == 0)
1235 		umtxq_signal(&key, 1);
1236 	umtxq_unbusy(&key);
1237 	umtxq_unlock(&key);
1238 	umtx_key_release(&key);
1239 	return (error);
1240 }
1241 
1242 static inline struct umtx_pi *
1243 umtx_pi_alloc(int flags)
1244 {
1245 	struct umtx_pi *pi;
1246 
1247 	pi = uma_zalloc(umtx_pi_zone, M_ZERO | flags);
1248 	TAILQ_INIT(&pi->pi_blocked);
1249 	atomic_add_int(&umtx_pi_allocated, 1);
1250 	return (pi);
1251 }
1252 
/*
 * Release a umtx_pi structure back to its zone and update the
 * allocation counter.  Counterpart of umtx_pi_alloc().
 */
static inline void
umtx_pi_free(struct umtx_pi *pi)
{
	uma_zfree(umtx_pi_zone, pi);
	atomic_add_int(&umtx_pi_allocated, -1);
}
1259 
1260 /*
1261  * Adjust the thread's position on a pi_state after its priority has been
1262  * changed.
1263  */
1264 static int
1265 umtx_pi_adjust_thread(struct umtx_pi *pi, struct thread *td)
1266 {
1267 	struct umtx_q *uq, *uq1, *uq2;
1268 	struct thread *td1;
1269 
1270 	mtx_assert(&umtx_lock, MA_OWNED);
1271 	if (pi == NULL)
1272 		return (0);
1273 
1274 	uq = td->td_umtxq;
1275 
1276 	/*
1277 	 * Check if the thread needs to be moved on the blocked chain.
1278 	 * It needs to be moved if either its priority is lower than
1279 	 * the previous thread or higher than the next thread.
1280 	 */
1281 	uq1 = TAILQ_PREV(uq, umtxq_head, uq_lockq);
1282 	uq2 = TAILQ_NEXT(uq, uq_lockq);
1283 	if ((uq1 != NULL && UPRI(td) < UPRI(uq1->uq_thread)) ||
1284 	    (uq2 != NULL && UPRI(td) > UPRI(uq2->uq_thread))) {
1285 		/*
1286 		 * Remove thread from blocked chain and determine where
1287 		 * it should be moved to.
1288 		 */
1289 		TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
1290 		TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
1291 			td1 = uq1->uq_thread;
1292 			MPASS(td1->td_proc->p_magic == P_MAGIC);
1293 			if (UPRI(td1) > UPRI(td))
1294 				break;
1295 		}
1296 
1297 		if (uq1 == NULL)
1298 			TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
1299 		else
1300 			TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
1301 	}
1302 	return (1);
1303 }
1304 
1305 static struct umtx_pi *
1306 umtx_pi_next(struct umtx_pi *pi)
1307 {
1308 	struct umtx_q *uq_owner;
1309 
1310 	if (pi->pi_owner == NULL)
1311 		return (NULL);
1312 	uq_owner = pi->pi_owner->td_umtxq;
1313 	if (uq_owner == NULL)
1314 		return (NULL);
1315 	return (uq_owner->uq_pi_blocked);
1316 }
1317 
1318 /*
1319  * Floyd's Cycle-Finding Algorithm.
1320  */
1321 static bool
1322 umtx_pi_check_loop(struct umtx_pi *pi)
1323 {
1324 	struct umtx_pi *pi1;	/* fast iterator */
1325 
1326 	mtx_assert(&umtx_lock, MA_OWNED);
1327 	if (pi == NULL)
1328 		return (false);
1329 	pi1 = pi;
1330 	for (;;) {
1331 		pi = umtx_pi_next(pi);
1332 		if (pi == NULL)
1333 			break;
1334 		pi1 = umtx_pi_next(pi1);
1335 		if (pi1 == NULL)
1336 			break;
1337 		pi1 = umtx_pi_next(pi1);
1338 		if (pi1 == NULL)
1339 			break;
1340 		if (pi == pi1)
1341 			return (true);
1342 	}
1343 	return (false);
1344 }
1345 
1346 /*
1347  * Propagate priority when a thread is blocked on POSIX
1348  * PI mutex.
1349  */
1350 static void
1351 umtx_propagate_priority(struct thread *td)
1352 {
1353 	struct umtx_q *uq;
1354 	struct umtx_pi *pi;
1355 	int pri;
1356 
1357 	mtx_assert(&umtx_lock, MA_OWNED);
1358 	pri = UPRI(td);
1359 	uq = td->td_umtxq;
1360 	pi = uq->uq_pi_blocked;
1361 	if (pi == NULL)
1362 		return;
1363 	if (umtx_pi_check_loop(pi))
1364 		return;
1365 
1366 	for (;;) {
1367 		td = pi->pi_owner;
1368 		if (td == NULL || td == curthread)
1369 			return;
1370 
1371 		MPASS(td->td_proc != NULL);
1372 		MPASS(td->td_proc->p_magic == P_MAGIC);
1373 
1374 		thread_lock(td);
1375 		if (td->td_lend_user_pri > pri)
1376 			sched_lend_user_prio(td, pri);
1377 		else {
1378 			thread_unlock(td);
1379 			break;
1380 		}
1381 		thread_unlock(td);
1382 
1383 		/*
1384 		 * Pick up the lock that td is blocked on.
1385 		 */
1386 		uq = td->td_umtxq;
1387 		pi = uq->uq_pi_blocked;
1388 		if (pi == NULL)
1389 			break;
1390 		/* Resort td on the list if needed. */
1391 		umtx_pi_adjust_thread(pi, td);
1392 	}
1393 }
1394 
1395 /*
1396  * Unpropagate priority for a PI mutex when a thread blocked on
1397  * it is interrupted by signal or resumed by others.
1398  */
1399 static void
1400 umtx_repropagate_priority(struct umtx_pi *pi)
1401 {
1402 	struct umtx_q *uq, *uq_owner;
1403 	struct umtx_pi *pi2;
1404 	int pri;
1405 
1406 	mtx_assert(&umtx_lock, MA_OWNED);
1407 
1408 	if (umtx_pi_check_loop(pi))
1409 		return;
1410 	while (pi != NULL && pi->pi_owner != NULL) {
1411 		pri = PRI_MAX;
1412 		uq_owner = pi->pi_owner->td_umtxq;
1413 
1414 		TAILQ_FOREACH(pi2, &uq_owner->uq_pi_contested, pi_link) {
1415 			uq = TAILQ_FIRST(&pi2->pi_blocked);
1416 			if (uq != NULL) {
1417 				if (pri > UPRI(uq->uq_thread))
1418 					pri = UPRI(uq->uq_thread);
1419 			}
1420 		}
1421 
1422 		if (pri > uq_owner->uq_inherited_pri)
1423 			pri = uq_owner->uq_inherited_pri;
1424 		thread_lock(pi->pi_owner);
1425 		sched_lend_user_prio(pi->pi_owner, pri);
1426 		thread_unlock(pi->pi_owner);
1427 		if ((pi = uq_owner->uq_pi_blocked) != NULL)
1428 			umtx_pi_adjust_thread(pi, uq_owner->uq_thread);
1429 	}
1430 }
1431 
1432 /*
1433  * Insert a PI mutex into owned list.
1434  */
1435 static void
1436 umtx_pi_setowner(struct umtx_pi *pi, struct thread *owner)
1437 {
1438 	struct umtx_q *uq_owner;
1439 
1440 	uq_owner = owner->td_umtxq;
1441 	mtx_assert(&umtx_lock, MA_OWNED);
1442 	if (pi->pi_owner != NULL)
1443 		panic("pi_ower != NULL");
1444 	pi->pi_owner = owner;
1445 	TAILQ_INSERT_TAIL(&uq_owner->uq_pi_contested, pi, pi_link);
1446 }
1447 
1448 
1449 /*
1450  * Disown a PI mutex, and remove it from the owned list.
1451  */
1452 static void
1453 umtx_pi_disown(struct umtx_pi *pi)
1454 {
1455 
1456 	mtx_assert(&umtx_lock, MA_OWNED);
1457 	TAILQ_REMOVE(&pi->pi_owner->td_umtxq->uq_pi_contested, pi, pi_link);
1458 	pi->pi_owner = NULL;
1459 }
1460 
1461 /*
1462  * Claim ownership of a PI mutex.
1463  */
1464 static int
1465 umtx_pi_claim(struct umtx_pi *pi, struct thread *owner)
1466 {
1467 	struct umtx_q *uq, *uq_owner;
1468 
1469 	uq_owner = owner->td_umtxq;
1470 	mtx_lock(&umtx_lock);
1471 	if (pi->pi_owner == owner) {
1472 		mtx_unlock(&umtx_lock);
1473 		return (0);
1474 	}
1475 
1476 	if (pi->pi_owner != NULL) {
1477 		/*
1478 		 * userland may have already messed the mutex, sigh.
1479 		 */
1480 		mtx_unlock(&umtx_lock);
1481 		return (EPERM);
1482 	}
1483 	umtx_pi_setowner(pi, owner);
1484 	uq = TAILQ_FIRST(&pi->pi_blocked);
1485 	if (uq != NULL) {
1486 		int pri;
1487 
1488 		pri = UPRI(uq->uq_thread);
1489 		thread_lock(owner);
1490 		if (pri < UPRI(owner))
1491 			sched_lend_user_prio(owner, pri);
1492 		thread_unlock(owner);
1493 	}
1494 	mtx_unlock(&umtx_lock);
1495 	return (0);
1496 }
1497 
1498 /*
1499  * Adjust a thread's order position in its blocked PI mutex,
1500  * this may result new priority propagating process.
1501  */
1502 void
1503 umtx_pi_adjust(struct thread *td, u_char oldpri)
1504 {
1505 	struct umtx_q *uq;
1506 	struct umtx_pi *pi;
1507 
1508 	uq = td->td_umtxq;
1509 	mtx_lock(&umtx_lock);
1510 	/*
1511 	 * Pick up the lock that td is blocked on.
1512 	 */
1513 	pi = uq->uq_pi_blocked;
1514 	if (pi != NULL) {
1515 		umtx_pi_adjust_thread(pi, td);
1516 		umtx_repropagate_priority(pi);
1517 	}
1518 	mtx_unlock(&umtx_lock);
1519 }
1520 
1521 /*
1522  * Sleep on a PI mutex.
1523  */
1524 static int
1525 umtxq_sleep_pi(struct umtx_q *uq, struct umtx_pi *pi,
1526 	uint32_t owner, const char *wmesg, struct abs_timeout *timo)
1527 {
1528 	struct umtxq_chain *uc;
1529 	struct thread *td, *td1;
1530 	struct umtx_q *uq1;
1531 	int pri;
1532 	int error = 0;
1533 
1534 	td = uq->uq_thread;
1535 	KASSERT(td == curthread, ("inconsistent uq_thread"));
1536 	uc = umtxq_getchain(&uq->uq_key);
1537 	UMTXQ_LOCKED_ASSERT(uc);
1538 	KASSERT(uc->uc_busy != 0, ("umtx chain is not busy"));
1539 	umtxq_insert(uq);
1540 	mtx_lock(&umtx_lock);
1541 	if (pi->pi_owner == NULL) {
1542 		mtx_unlock(&umtx_lock);
1543 		/* XXX Only look up thread in current process. */
1544 		td1 = tdfind(owner, curproc->p_pid);
1545 		mtx_lock(&umtx_lock);
1546 		if (td1 != NULL) {
1547 			if (pi->pi_owner == NULL)
1548 				umtx_pi_setowner(pi, td1);
1549 			PROC_UNLOCK(td1->td_proc);
1550 		}
1551 	}
1552 
1553 	TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
1554 		pri = UPRI(uq1->uq_thread);
1555 		if (pri > UPRI(td))
1556 			break;
1557 	}
1558 
1559 	if (uq1 != NULL)
1560 		TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
1561 	else
1562 		TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
1563 
1564 	uq->uq_pi_blocked = pi;
1565 	thread_lock(td);
1566 	td->td_flags |= TDF_UPIBLOCKED;
1567 	thread_unlock(td);
1568 	umtx_propagate_priority(td);
1569 	mtx_unlock(&umtx_lock);
1570 	umtxq_unbusy(&uq->uq_key);
1571 
1572 	error = umtxq_sleep(uq, wmesg, timo);
1573 	umtxq_remove(uq);
1574 
1575 	mtx_lock(&umtx_lock);
1576 	uq->uq_pi_blocked = NULL;
1577 	thread_lock(td);
1578 	td->td_flags &= ~TDF_UPIBLOCKED;
1579 	thread_unlock(td);
1580 	TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
1581 	umtx_repropagate_priority(pi);
1582 	mtx_unlock(&umtx_lock);
1583 	umtxq_unlock(&uq->uq_key);
1584 
1585 	return (error);
1586 }
1587 
1588 /*
1589  * Add reference count for a PI mutex.
1590  */
1591 static void
1592 umtx_pi_ref(struct umtx_pi *pi)
1593 {
1594 	struct umtxq_chain *uc;
1595 
1596 	uc = umtxq_getchain(&pi->pi_key);
1597 	UMTXQ_LOCKED_ASSERT(uc);
1598 	pi->pi_refcount++;
1599 }
1600 
1601 /*
1602  * Decrease reference count for a PI mutex, if the counter
1603  * is decreased to zero, its memory space is freed.
1604  */
1605 static void
1606 umtx_pi_unref(struct umtx_pi *pi)
1607 {
1608 	struct umtxq_chain *uc;
1609 
1610 	uc = umtxq_getchain(&pi->pi_key);
1611 	UMTXQ_LOCKED_ASSERT(uc);
1612 	KASSERT(pi->pi_refcount > 0, ("invalid reference count"));
1613 	if (--pi->pi_refcount == 0) {
1614 		mtx_lock(&umtx_lock);
1615 		if (pi->pi_owner != NULL) {
1616 			TAILQ_REMOVE(&pi->pi_owner->td_umtxq->uq_pi_contested,
1617 				pi, pi_link);
1618 			pi->pi_owner = NULL;
1619 		}
1620 		KASSERT(TAILQ_EMPTY(&pi->pi_blocked),
1621 			("blocked queue not empty"));
1622 		mtx_unlock(&umtx_lock);
1623 		TAILQ_REMOVE(&uc->uc_pi_list, pi, pi_hashlink);
1624 		umtx_pi_free(pi);
1625 	}
1626 }
1627 
1628 /*
1629  * Find a PI mutex in hash table.
1630  */
1631 static struct umtx_pi *
1632 umtx_pi_lookup(struct umtx_key *key)
1633 {
1634 	struct umtxq_chain *uc;
1635 	struct umtx_pi *pi;
1636 
1637 	uc = umtxq_getchain(key);
1638 	UMTXQ_LOCKED_ASSERT(uc);
1639 
1640 	TAILQ_FOREACH(pi, &uc->uc_pi_list, pi_hashlink) {
1641 		if (umtx_key_match(&pi->pi_key, key)) {
1642 			return (pi);
1643 		}
1644 	}
1645 	return (NULL);
1646 }
1647 
1648 /*
1649  * Insert a PI mutex into hash table.
1650  */
1651 static inline void
1652 umtx_pi_insert(struct umtx_pi *pi)
1653 {
1654 	struct umtxq_chain *uc;
1655 
1656 	uc = umtxq_getchain(&pi->pi_key);
1657 	UMTXQ_LOCKED_ASSERT(uc);
1658 	TAILQ_INSERT_TAIL(&uc->uc_pi_list, pi, pi_hashlink);
1659 }
1660 
1661 /*
1662  * Lock a PI mutex.
1663  */
1664 static int
1665 do_lock_pi(struct thread *td, struct umutex *m, uint32_t flags,
1666     struct _umtx_time *timeout, int try)
1667 {
1668 	struct abs_timeout timo;
1669 	struct umtx_q *uq;
1670 	struct umtx_pi *pi, *new_pi;
1671 	uint32_t id, owner, old;
1672 	int error, rv;
1673 
1674 	id = td->td_tid;
1675 	uq = td->td_umtxq;
1676 
1677 	if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
1678 	    &uq->uq_key)) != 0)
1679 		return (error);
1680 
1681 	if (timeout != NULL)
1682 		abs_timeout_init2(&timo, timeout);
1683 
1684 	umtxq_lock(&uq->uq_key);
1685 	pi = umtx_pi_lookup(&uq->uq_key);
1686 	if (pi == NULL) {
1687 		new_pi = umtx_pi_alloc(M_NOWAIT);
1688 		if (new_pi == NULL) {
1689 			umtxq_unlock(&uq->uq_key);
1690 			new_pi = umtx_pi_alloc(M_WAITOK);
1691 			umtxq_lock(&uq->uq_key);
1692 			pi = umtx_pi_lookup(&uq->uq_key);
1693 			if (pi != NULL) {
1694 				umtx_pi_free(new_pi);
1695 				new_pi = NULL;
1696 			}
1697 		}
1698 		if (new_pi != NULL) {
1699 			new_pi->pi_key = uq->uq_key;
1700 			umtx_pi_insert(new_pi);
1701 			pi = new_pi;
1702 		}
1703 	}
1704 	umtx_pi_ref(pi);
1705 	umtxq_unlock(&uq->uq_key);
1706 
1707 	/*
1708 	 * Care must be exercised when dealing with umtx structure.  It
1709 	 * can fault on any access.
1710 	 */
1711 	for (;;) {
1712 		/*
1713 		 * Try the uncontested case.  This should be done in userland.
1714 		 */
1715 		rv = casueword32(&m->m_owner, UMUTEX_UNOWNED, &owner, id);
1716 		/* The address was invalid. */
1717 		if (rv == -1) {
1718 			error = EFAULT;
1719 			break;
1720 		}
1721 
1722 		/* The acquire succeeded. */
1723 		if (owner == UMUTEX_UNOWNED) {
1724 			error = 0;
1725 			break;
1726 		}
1727 
1728 		/* If no one owns it but it is contested try to acquire it. */
1729 		if (owner == UMUTEX_CONTESTED) {
1730 			rv = casueword32(&m->m_owner,
1731 			    UMUTEX_CONTESTED, &owner, id | UMUTEX_CONTESTED);
1732 			/* The address was invalid. */
1733 			if (rv == -1) {
1734 				error = EFAULT;
1735 				break;
1736 			}
1737 
1738 			if (owner == UMUTEX_CONTESTED) {
1739 				umtxq_lock(&uq->uq_key);
1740 				umtxq_busy(&uq->uq_key);
1741 				error = umtx_pi_claim(pi, td);
1742 				umtxq_unbusy(&uq->uq_key);
1743 				umtxq_unlock(&uq->uq_key);
1744 				if (error != 0) {
1745 					/*
1746 					 * Since we're going to return an
1747 					 * error, restore the m_owner to its
1748 					 * previous, unowned state to avoid
1749 					 * compounding the problem.
1750 					 */
1751 					(void)casuword32(&m->m_owner,
1752 					    id | UMUTEX_CONTESTED,
1753 					    UMUTEX_CONTESTED);
1754 				}
1755 				break;
1756 			}
1757 
1758 			error = umtxq_check_susp(td);
1759 			if (error != 0)
1760 				break;
1761 
1762 			/* If this failed the lock has changed, restart. */
1763 			continue;
1764 		}
1765 
1766 		if ((owner & ~UMUTEX_CONTESTED) == id) {
1767 			error = EDEADLK;
1768 			break;
1769 		}
1770 
1771 		if (try != 0) {
1772 			error = EBUSY;
1773 			break;
1774 		}
1775 
1776 		/*
1777 		 * If we caught a signal, we have retried and now
1778 		 * exit immediately.
1779 		 */
1780 		if (error != 0)
1781 			break;
1782 
1783 		umtxq_lock(&uq->uq_key);
1784 		umtxq_busy(&uq->uq_key);
1785 		umtxq_unlock(&uq->uq_key);
1786 
1787 		/*
1788 		 * Set the contested bit so that a release in user space
1789 		 * knows to use the system call for unlock.  If this fails
1790 		 * either some one else has acquired the lock or it has been
1791 		 * released.
1792 		 */
1793 		rv = casueword32(&m->m_owner, owner, &old,
1794 		    owner | UMUTEX_CONTESTED);
1795 
1796 		/* The address was invalid. */
1797 		if (rv == -1) {
1798 			umtxq_unbusy_unlocked(&uq->uq_key);
1799 			error = EFAULT;
1800 			break;
1801 		}
1802 
1803 		umtxq_lock(&uq->uq_key);
1804 		/*
1805 		 * We set the contested bit, sleep. Otherwise the lock changed
1806 		 * and we need to retry or we lost a race to the thread
1807 		 * unlocking the umtx.
1808 		 */
1809 		if (old == owner) {
1810 			error = umtxq_sleep_pi(uq, pi, owner & ~UMUTEX_CONTESTED,
1811 			    "umtxpi", timeout == NULL ? NULL : &timo);
1812 			if (error != 0)
1813 				continue;
1814 		} else {
1815 			umtxq_unbusy(&uq->uq_key);
1816 			umtxq_unlock(&uq->uq_key);
1817 		}
1818 
1819 		error = umtxq_check_susp(td);
1820 		if (error != 0)
1821 			break;
1822 	}
1823 
1824 	umtxq_lock(&uq->uq_key);
1825 	umtx_pi_unref(pi);
1826 	umtxq_unlock(&uq->uq_key);
1827 
1828 	umtx_key_release(&uq->uq_key);
1829 	return (error);
1830 }
1831 
1832 /*
1833  * Unlock a PI mutex.
1834  */
1835 static int
1836 do_unlock_pi(struct thread *td, struct umutex *m, uint32_t flags)
1837 {
1838 	struct umtx_key key;
1839 	struct umtx_q *uq_first, *uq_first2, *uq_me;
1840 	struct umtx_pi *pi, *pi2;
1841 	uint32_t owner, old, id;
1842 	int error;
1843 	int count;
1844 	int pri;
1845 
1846 	id = td->td_tid;
1847 	/*
1848 	 * Make sure we own this mtx.
1849 	 */
1850 	error = fueword32(&m->m_owner, &owner);
1851 	if (error == -1)
1852 		return (EFAULT);
1853 
1854 	if ((owner & ~UMUTEX_CONTESTED) != id)
1855 		return (EPERM);
1856 
1857 	/* This should be done in userland */
1858 	if ((owner & UMUTEX_CONTESTED) == 0) {
1859 		error = casueword32(&m->m_owner, owner, &old, UMUTEX_UNOWNED);
1860 		if (error == -1)
1861 			return (EFAULT);
1862 		if (old == owner)
1863 			return (0);
1864 		owner = old;
1865 	}
1866 
1867 	/* We should only ever be in here for contested locks */
1868 	if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
1869 	    &key)) != 0)
1870 		return (error);
1871 
1872 	umtxq_lock(&key);
1873 	umtxq_busy(&key);
1874 	count = umtxq_count_pi(&key, &uq_first);
1875 	if (uq_first != NULL) {
1876 		mtx_lock(&umtx_lock);
1877 		pi = uq_first->uq_pi_blocked;
1878 		KASSERT(pi != NULL, ("pi == NULL?"));
1879 		if (pi->pi_owner != curthread) {
1880 			mtx_unlock(&umtx_lock);
1881 			umtxq_unbusy(&key);
1882 			umtxq_unlock(&key);
1883 			umtx_key_release(&key);
1884 			/* userland messed the mutex */
1885 			return (EPERM);
1886 		}
1887 		uq_me = curthread->td_umtxq;
1888 		umtx_pi_disown(pi);
1889 		/* get highest priority thread which is still sleeping. */
1890 		uq_first = TAILQ_FIRST(&pi->pi_blocked);
1891 		while (uq_first != NULL &&
1892 		       (uq_first->uq_flags & UQF_UMTXQ) == 0) {
1893 			uq_first = TAILQ_NEXT(uq_first, uq_lockq);
1894 		}
1895 		pri = PRI_MAX;
1896 		TAILQ_FOREACH(pi2, &uq_me->uq_pi_contested, pi_link) {
1897 			uq_first2 = TAILQ_FIRST(&pi2->pi_blocked);
1898 			if (uq_first2 != NULL) {
1899 				if (pri > UPRI(uq_first2->uq_thread))
1900 					pri = UPRI(uq_first2->uq_thread);
1901 			}
1902 		}
1903 		thread_lock(curthread);
1904 		sched_lend_user_prio(curthread, pri);
1905 		thread_unlock(curthread);
1906 		mtx_unlock(&umtx_lock);
1907 		if (uq_first)
1908 			umtxq_signal_thread(uq_first);
1909 	} else {
1910 		pi = umtx_pi_lookup(&key);
1911 		/*
1912 		 * A umtx_pi can exist if a signal or timeout removed the
1913 		 * last waiter from the umtxq, but there is still
1914 		 * a thread in do_lock_pi() holding the umtx_pi.
1915 		 */
1916 		if (pi != NULL) {
1917 			/*
1918 			 * The umtx_pi can be unowned, such as when a thread
1919 			 * has just entered do_lock_pi(), allocated the
1920 			 * umtx_pi, and unlocked the umtxq.
1921 			 * If the current thread owns it, it must disown it.
1922 			 */
1923 			mtx_lock(&umtx_lock);
1924 			if (pi->pi_owner == td)
1925 				umtx_pi_disown(pi);
1926 			mtx_unlock(&umtx_lock);
1927 		}
1928 	}
1929 	umtxq_unlock(&key);
1930 
1931 	/*
1932 	 * When unlocking the umtx, it must be marked as unowned if
1933 	 * there is zero or one thread only waiting for it.
1934 	 * Otherwise, it must be marked as contested.
1935 	 */
1936 	error = casueword32(&m->m_owner, owner, &old,
1937 	    count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
1938 
1939 	umtxq_unbusy_unlocked(&key);
1940 	umtx_key_release(&key);
1941 	if (error == -1)
1942 		return (EFAULT);
1943 	if (old != owner)
1944 		return (EINVAL);
1945 	return (0);
1946 }
1947 
1948 /*
1949  * Lock a PP mutex.
1950  */
1951 static int
1952 do_lock_pp(struct thread *td, struct umutex *m, uint32_t flags,
1953     struct _umtx_time *timeout, int try)
1954 {
1955 	struct abs_timeout timo;
1956 	struct umtx_q *uq, *uq2;
1957 	struct umtx_pi *pi;
1958 	uint32_t ceiling;
1959 	uint32_t owner, id;
1960 	int error, pri, old_inherited_pri, su, rv;
1961 
1962 	id = td->td_tid;
1963 	uq = td->td_umtxq;
1964 	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
1965 	    &uq->uq_key)) != 0)
1966 		return (error);
1967 
1968 	if (timeout != NULL)
1969 		abs_timeout_init2(&timo, timeout);
1970 
1971 	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
1972 	for (;;) {
1973 		old_inherited_pri = uq->uq_inherited_pri;
1974 		umtxq_lock(&uq->uq_key);
1975 		umtxq_busy(&uq->uq_key);
1976 		umtxq_unlock(&uq->uq_key);
1977 
1978 		rv = fueword32(&m->m_ceilings[0], &ceiling);
1979 		if (rv == -1) {
1980 			error = EFAULT;
1981 			goto out;
1982 		}
1983 		ceiling = RTP_PRIO_MAX - ceiling;
1984 		if (ceiling > RTP_PRIO_MAX) {
1985 			error = EINVAL;
1986 			goto out;
1987 		}
1988 
1989 		mtx_lock(&umtx_lock);
1990 		if (UPRI(td) < PRI_MIN_REALTIME + ceiling) {
1991 			mtx_unlock(&umtx_lock);
1992 			error = EINVAL;
1993 			goto out;
1994 		}
1995 		if (su && PRI_MIN_REALTIME + ceiling < uq->uq_inherited_pri) {
1996 			uq->uq_inherited_pri = PRI_MIN_REALTIME + ceiling;
1997 			thread_lock(td);
1998 			if (uq->uq_inherited_pri < UPRI(td))
1999 				sched_lend_user_prio(td, uq->uq_inherited_pri);
2000 			thread_unlock(td);
2001 		}
2002 		mtx_unlock(&umtx_lock);
2003 
2004 		rv = casueword32(&m->m_owner,
2005 		    UMUTEX_CONTESTED, &owner, id | UMUTEX_CONTESTED);
2006 		/* The address was invalid. */
2007 		if (rv == -1) {
2008 			error = EFAULT;
2009 			break;
2010 		}
2011 
2012 		if (owner == UMUTEX_CONTESTED) {
2013 			error = 0;
2014 			break;
2015 		}
2016 
2017 		if (try != 0) {
2018 			error = EBUSY;
2019 			break;
2020 		}
2021 
2022 		/*
2023 		 * If we caught a signal, we have retried and now
2024 		 * exit immediately.
2025 		 */
2026 		if (error != 0)
2027 			break;
2028 
2029 		umtxq_lock(&uq->uq_key);
2030 		umtxq_insert(uq);
2031 		umtxq_unbusy(&uq->uq_key);
2032 		error = umtxq_sleep(uq, "umtxpp", timeout == NULL ?
2033 		    NULL : &timo);
2034 		umtxq_remove(uq);
2035 		umtxq_unlock(&uq->uq_key);
2036 
2037 		mtx_lock(&umtx_lock);
2038 		uq->uq_inherited_pri = old_inherited_pri;
2039 		pri = PRI_MAX;
2040 		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
2041 			uq2 = TAILQ_FIRST(&pi->pi_blocked);
2042 			if (uq2 != NULL) {
2043 				if (pri > UPRI(uq2->uq_thread))
2044 					pri = UPRI(uq2->uq_thread);
2045 			}
2046 		}
2047 		if (pri > uq->uq_inherited_pri)
2048 			pri = uq->uq_inherited_pri;
2049 		thread_lock(td);
2050 		sched_lend_user_prio(td, pri);
2051 		thread_unlock(td);
2052 		mtx_unlock(&umtx_lock);
2053 	}
2054 
2055 	if (error != 0) {
2056 		mtx_lock(&umtx_lock);
2057 		uq->uq_inherited_pri = old_inherited_pri;
2058 		pri = PRI_MAX;
2059 		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
2060 			uq2 = TAILQ_FIRST(&pi->pi_blocked);
2061 			if (uq2 != NULL) {
2062 				if (pri > UPRI(uq2->uq_thread))
2063 					pri = UPRI(uq2->uq_thread);
2064 			}
2065 		}
2066 		if (pri > uq->uq_inherited_pri)
2067 			pri = uq->uq_inherited_pri;
2068 		thread_lock(td);
2069 		sched_lend_user_prio(td, pri);
2070 		thread_unlock(td);
2071 		mtx_unlock(&umtx_lock);
2072 	}
2073 
2074 out:
2075 	umtxq_unbusy_unlocked(&uq->uq_key);
2076 	umtx_key_release(&uq->uq_key);
2077 	return (error);
2078 }
2079 
2080 /*
2081  * Unlock a PP mutex.
2082  */
2083 static int
2084 do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags)
2085 {
2086 	struct umtx_key key;
2087 	struct umtx_q *uq, *uq2;
2088 	struct umtx_pi *pi;
2089 	uint32_t owner, id;
2090 	uint32_t rceiling;
2091 	int error, pri, new_inherited_pri, su;
2092 
2093 	id = td->td_tid;
2094 	uq = td->td_umtxq;
2095 	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
2096 
2097 	/*
2098 	 * Make sure we own this mtx.
2099 	 */
2100 	error = fueword32(&m->m_owner, &owner);
2101 	if (error == -1)
2102 		return (EFAULT);
2103 
2104 	if ((owner & ~UMUTEX_CONTESTED) != id)
2105 		return (EPERM);
2106 
2107 	error = copyin(&m->m_ceilings[1], &rceiling, sizeof(uint32_t));
2108 	if (error != 0)
2109 		return (error);
2110 
2111 	if (rceiling == -1)
2112 		new_inherited_pri = PRI_MAX;
2113 	else {
2114 		rceiling = RTP_PRIO_MAX - rceiling;
2115 		if (rceiling > RTP_PRIO_MAX)
2116 			return (EINVAL);
2117 		new_inherited_pri = PRI_MIN_REALTIME + rceiling;
2118 	}
2119 
2120 	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
2121 	    &key)) != 0)
2122 		return (error);
2123 	umtxq_lock(&key);
2124 	umtxq_busy(&key);
2125 	umtxq_unlock(&key);
2126 	/*
2127 	 * For priority protected mutex, always set unlocked state
2128 	 * to UMUTEX_CONTESTED, so that userland always enters kernel
2129 	 * to lock the mutex, it is necessary because thread priority
2130 	 * has to be adjusted for such mutex.
2131 	 */
2132 	error = suword32(&m->m_owner, UMUTEX_CONTESTED);
2133 
2134 	umtxq_lock(&key);
2135 	if (error == 0)
2136 		umtxq_signal(&key, 1);
2137 	umtxq_unbusy(&key);
2138 	umtxq_unlock(&key);
2139 
2140 	if (error == -1)
2141 		error = EFAULT;
2142 	else {
2143 		mtx_lock(&umtx_lock);
2144 		if (su != 0)
2145 			uq->uq_inherited_pri = new_inherited_pri;
2146 		pri = PRI_MAX;
2147 		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
2148 			uq2 = TAILQ_FIRST(&pi->pi_blocked);
2149 			if (uq2 != NULL) {
2150 				if (pri > UPRI(uq2->uq_thread))
2151 					pri = UPRI(uq2->uq_thread);
2152 			}
2153 		}
2154 		if (pri > uq->uq_inherited_pri)
2155 			pri = uq->uq_inherited_pri;
2156 		thread_lock(td);
2157 		sched_lend_user_prio(td, pri);
2158 		thread_unlock(td);
2159 		mtx_unlock(&umtx_lock);
2160 	}
2161 	umtx_key_release(&key);
2162 	return (error);
2163 }
2164 
2165 static int
2166 do_set_ceiling(struct thread *td, struct umutex *m, uint32_t ceiling,
2167 	uint32_t *old_ceiling)
2168 {
2169 	struct umtx_q *uq;
2170 	uint32_t save_ceiling;
2171 	uint32_t owner, id;
2172 	uint32_t flags;
2173 	int error, rv;
2174 
2175 	error = fueword32(&m->m_flags, &flags);
2176 	if (error == -1)
2177 		return (EFAULT);
2178 	if ((flags & UMUTEX_PRIO_PROTECT) == 0)
2179 		return (EINVAL);
2180 	if (ceiling > RTP_PRIO_MAX)
2181 		return (EINVAL);
2182 	id = td->td_tid;
2183 	uq = td->td_umtxq;
2184 	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
2185 	   &uq->uq_key)) != 0)
2186 		return (error);
2187 	for (;;) {
2188 		umtxq_lock(&uq->uq_key);
2189 		umtxq_busy(&uq->uq_key);
2190 		umtxq_unlock(&uq->uq_key);
2191 
2192 		rv = fueword32(&m->m_ceilings[0], &save_ceiling);
2193 		if (rv == -1) {
2194 			error = EFAULT;
2195 			break;
2196 		}
2197 
2198 		rv = casueword32(&m->m_owner,
2199 		    UMUTEX_CONTESTED, &owner, id | UMUTEX_CONTESTED);
2200 		if (rv == -1) {
2201 			error = EFAULT;
2202 			break;
2203 		}
2204 
2205 		if (owner == UMUTEX_CONTESTED) {
2206 			suword32(&m->m_ceilings[0], ceiling);
2207 			suword32(&m->m_owner, UMUTEX_CONTESTED);
2208 			error = 0;
2209 			break;
2210 		}
2211 
2212 		if ((owner & ~UMUTEX_CONTESTED) == id) {
2213 			suword32(&m->m_ceilings[0], ceiling);
2214 			error = 0;
2215 			break;
2216 		}
2217 
2218 		/*
2219 		 * If we caught a signal, we have retried and now
2220 		 * exit immediately.
2221 		 */
2222 		if (error != 0)
2223 			break;
2224 
2225 		/*
2226 		 * We set the contested bit, sleep. Otherwise the lock changed
2227 		 * and we need to retry or we lost a race to the thread
2228 		 * unlocking the umtx.
2229 		 */
2230 		umtxq_lock(&uq->uq_key);
2231 		umtxq_insert(uq);
2232 		umtxq_unbusy(&uq->uq_key);
2233 		error = umtxq_sleep(uq, "umtxpp", NULL);
2234 		umtxq_remove(uq);
2235 		umtxq_unlock(&uq->uq_key);
2236 	}
2237 	umtxq_lock(&uq->uq_key);
2238 	if (error == 0)
2239 		umtxq_signal(&uq->uq_key, INT_MAX);
2240 	umtxq_unbusy(&uq->uq_key);
2241 	umtxq_unlock(&uq->uq_key);
2242 	umtx_key_release(&uq->uq_key);
2243 	if (error == 0 && old_ceiling != NULL)
2244 		suword32(old_ceiling, save_ceiling);
2245 	return (error);
2246 }
2247 
2248 /*
2249  * Lock a userland POSIX mutex.
2250  */
2251 static int
2252 do_lock_umutex(struct thread *td, struct umutex *m,
2253     struct _umtx_time *timeout, int mode)
2254 {
2255 	uint32_t flags;
2256 	int error;
2257 
2258 	error = fueword32(&m->m_flags, &flags);
2259 	if (error == -1)
2260 		return (EFAULT);
2261 
2262 	switch(flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
2263 	case 0:
2264 		error = do_lock_normal(td, m, flags, timeout, mode);
2265 		break;
2266 	case UMUTEX_PRIO_INHERIT:
2267 		error = do_lock_pi(td, m, flags, timeout, mode);
2268 		break;
2269 	case UMUTEX_PRIO_PROTECT:
2270 		error = do_lock_pp(td, m, flags, timeout, mode);
2271 		break;
2272 	default:
2273 		return (EINVAL);
2274 	}
2275 	if (timeout == NULL) {
2276 		if (error == EINTR && mode != _UMUTEX_WAIT)
2277 			error = ERESTART;
2278 	} else {
2279 		/* Timed-locking is not restarted. */
2280 		if (error == ERESTART)
2281 			error = EINTR;
2282 	}
2283 	return (error);
2284 }
2285 
2286 /*
2287  * Unlock a userland POSIX mutex.
2288  */
2289 static int
2290 do_unlock_umutex(struct thread *td, struct umutex *m)
2291 {
2292 	uint32_t flags;
2293 	int error;
2294 
2295 	error = fueword32(&m->m_flags, &flags);
2296 	if (error == -1)
2297 		return (EFAULT);
2298 
2299 	switch(flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
2300 	case 0:
2301 		return (do_unlock_normal(td, m, flags));
2302 	case UMUTEX_PRIO_INHERIT:
2303 		return (do_unlock_pi(td, m, flags));
2304 	case UMUTEX_PRIO_PROTECT:
2305 		return (do_unlock_pp(td, m, flags));
2306 	}
2307 
2308 	return (EINVAL);
2309 }
2310 
/*
 * Wait on a userland condition variable.
 *
 * Enqueues the caller on the cv's sleep queue, marks c_has_waiters,
 * unlocks the associated mutex `m', then sleeps.  On an abnormal
 * wakeup (timeout/signal/spurious) the caller dequeues itself and
 * clears c_has_waiters if it was the last waiter.  `wflags' selects
 * the clock (CVWAIT_CLOCKID) and absolute-time mode (CVWAIT_ABSTIME).
 */
static int
do_cv_wait(struct thread *td, struct ucond *cv, struct umutex *m,
	struct timespec *timeout, u_long wflags)
{
	struct abs_timeout timo;
	struct umtx_q *uq;
	uint32_t flags, clockid, hasw;
	int error;

	uq = td->td_umtxq;
	error = fueword32(&cv->c_flags, &flags);
	if (error == -1)
		return (EFAULT);
	error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &uq->uq_key);
	if (error != 0)
		return (error);

	if ((wflags & CVWAIT_CLOCKID) != 0) {
		error = fueword32(&cv->c_clockid, &clockid);
		if (error == -1) {
			umtx_key_release(&uq->uq_key);
			return (EFAULT);
		}
		if (clockid < CLOCK_REALTIME ||
		    clockid >= CLOCK_THREAD_CPUTIME_ID) {
			/* hmm, only HW clock id will work. */
			umtx_key_release(&uq->uq_key);
			return (EINVAL);
		}
	} else {
		clockid = CLOCK_REALTIME;
	}

	/* Enqueue before releasing the mutex to avoid lost wakeups. */
	umtxq_lock(&uq->uq_key);
	umtxq_busy(&uq->uq_key);
	umtxq_insert(uq);
	umtxq_unlock(&uq->uq_key);

	/*
	 * Set c_has_waiters to 1 before releasing user mutex, also
	 * don't modify cache line when unnecessary.
	 */
	error = fueword32(&cv->c_has_waiters, &hasw);
	if (error == 0 && hasw == 0)
		suword32(&cv->c_has_waiters, 1);

	umtxq_unbusy_unlocked(&uq->uq_key);

	error = do_unlock_umutex(td, m);

	if (timeout != NULL)
		abs_timeout_init(&timo, clockid, ((wflags & CVWAIT_ABSTIME) != 0),
			timeout);

	umtxq_lock(&uq->uq_key);
	if (error == 0) {
		error = umtxq_sleep(uq, "ucond", timeout == NULL ?
		    NULL : &timo);
	}

	if ((uq->uq_flags & UQF_UMTXQ) == 0)
		/* A signal/broadcast removed us: the wait succeeded. */
		error = 0;
	else {
		/*
		 * This must be timeout,interrupted by signal or
		 * surprious wakeup, clear c_has_waiter flag when
		 * necessary.
		 */
		umtxq_busy(&uq->uq_key);
		if ((uq->uq_flags & UQF_UMTXQ) != 0) {
			int oldlen = uq->uq_cur_queue->length;
			umtxq_remove(uq);
			/* Last waiter clears the userspace waiters flag. */
			if (oldlen == 1) {
				umtxq_unlock(&uq->uq_key);
				suword32(&cv->c_has_waiters, 0);
				umtxq_lock(&uq->uq_key);
			}
		}
		umtxq_unbusy(&uq->uq_key);
		if (error == ERESTART)
			error = EINTR;
	}

	umtxq_unlock(&uq->uq_key);
	umtx_key_release(&uq->uq_key);
	return (error);
}
2398 
2399 /*
2400  * Signal a userland condition variable.
2401  */
static int
do_cv_signal(struct thread *td, struct ucond *cv)
{
	struct umtx_key key;
	int error, cnt, nwake;
	uint32_t flags;

	error = fueword32(&cv->c_flags, &flags);
	if (error == -1)
		return (EFAULT);
	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
		return (error);
	umtxq_lock(&key);
	umtxq_busy(&key);
	cnt = umtxq_count(&key);
	nwake = umtxq_signal(&key, 1);
	if (cnt <= nwake) {
		/*
		 * No waiters remain: clear the user-visible flag.  The
		 * chain lock must be dropped around the user memory
		 * store; the busy bit keeps the queue stable meanwhile.
		 */
		umtxq_unlock(&key);
		error = suword32(&cv->c_has_waiters, 0);
		if (error == -1)
			error = EFAULT;
		umtxq_lock(&key);
	}
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	return (error);
}
2430 
/*
 * Wake all threads sleeping on a userland condition variable and clear
 * its c_has_waiters flag.
 */
static int
do_cv_broadcast(struct thread *td, struct ucond *cv)
{
	struct umtx_key key;
	int error;
	uint32_t flags;

	error = fueword32(&cv->c_flags, &flags);
	if (error == -1)
		return (EFAULT);
	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
		return (error);

	umtxq_lock(&key);
	umtxq_busy(&key);
	umtxq_signal(&key, INT_MAX);
	umtxq_unlock(&key);

	/* All waiters were woken above, so the flag can be cleared. */
	error = suword32(&cv->c_has_waiters, 0);
	if (error == -1)
		error = EFAULT;

	umtxq_unbusy_unlocked(&key);

	umtx_key_release(&key);
	return (error);
}
2458 
/*
 * Lock a userland rwlock for reading.  Unless the lock (via its flags)
 * or the caller (via fflag) prefers readers, pending writers block new
 * readers.  Returns 0 on success, EAGAIN if the reader count would
 * overflow, EFAULT on a user memory fault, or a sleep error (timeout
 * is mapped ERESTART -> EINTR at the end).
 */
static int
do_rw_rdlock(struct thread *td, struct urwlock *rwlock, long fflag, struct _umtx_time *timeout)
{
	struct abs_timeout timo;
	struct umtx_q *uq;
	uint32_t flags, wrflags;
	int32_t state, oldstate;
	int32_t blocked_readers;
	int error, rv;

	uq = td->td_umtxq;
	error = fueword32(&rwlock->rw_flags, &flags);
	if (error == -1)
		return (EFAULT);
	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
	if (error != 0)
		return (error);

	if (timeout != NULL)
		abs_timeout_init2(&timo, timeout);

	/*
	 * Bits whose presence in rw_state blocks a new reader.  With
	 * writer preference, waiting writers also block us.
	 */
	wrflags = URWLOCK_WRITE_OWNER;
	if (!(fflag & URWLOCK_PREFER_READER) && !(flags & URWLOCK_PREFER_READER))
		wrflags |= URWLOCK_WRITE_WAITERS;

	for (;;) {
		rv = fueword32(&rwlock->rw_state, &state);
		if (rv == -1) {
			umtx_key_release(&uq->uq_key);
			return (EFAULT);
		}

		/* try to lock it */
		while (!(state & wrflags)) {
			if (__predict_false(URWLOCK_READER_COUNT(state) == URWLOCK_MAX_READERS)) {
				umtx_key_release(&uq->uq_key);
				return (EAGAIN);
			}
			/* CAS in one more reader. */
			rv = casueword32(&rwlock->rw_state, state,
			    &oldstate, state + 1);
			if (rv == -1) {
				umtx_key_release(&uq->uq_key);
				return (EFAULT);
			}
			if (oldstate == state) {
				umtx_key_release(&uq->uq_key);
				return (0);
			}
			error = umtxq_check_susp(td);
			if (error != 0)
				break;
			state = oldstate;
		}

		if (error)
			break;

		/* grab monitor lock */
		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		/*
		 * re-read the state, in case it changed between the try-lock above
		 * and the check below
		 */
		rv = fueword32(&rwlock->rw_state, &state);
		if (rv == -1)
			error = EFAULT;

		/* set read contention bit */
		while (error == 0 && (state & wrflags) &&
		    !(state & URWLOCK_READ_WAITERS)) {
			rv = casueword32(&rwlock->rw_state, state,
			    &oldstate, state | URWLOCK_READ_WAITERS);
			if (rv == -1) {
				error = EFAULT;
				break;
			}
			if (oldstate == state)
				goto sleep;
			state = oldstate;
			error = umtxq_check_susp(td);
			if (error != 0)
				break;
		}
		if (error != 0) {
			umtxq_unbusy_unlocked(&uq->uq_key);
			break;
		}

		/* state is changed while setting flags, restart */
		if (!(state & wrflags)) {
			umtxq_unbusy_unlocked(&uq->uq_key);
			error = umtxq_check_susp(td);
			if (error != 0)
				break;
			continue;
		}

sleep:
		/* contention bit is set, before sleeping, increase read waiter count */
		rv = fueword32(&rwlock->rw_blocked_readers,
		    &blocked_readers);
		if (rv == -1) {
			umtxq_unbusy_unlocked(&uq->uq_key);
			error = EFAULT;
			break;
		}
		suword32(&rwlock->rw_blocked_readers, blocked_readers+1);

		while (state & wrflags) {
			umtxq_lock(&uq->uq_key);
			umtxq_insert(uq);
			umtxq_unbusy(&uq->uq_key);

			error = umtxq_sleep(uq, "urdlck", timeout == NULL ?
			    NULL : &timo);

			umtxq_busy(&uq->uq_key);
			umtxq_remove(uq);
			umtxq_unlock(&uq->uq_key);
			if (error)
				break;
			rv = fueword32(&rwlock->rw_state, &state);
			if (rv == -1) {
				error = EFAULT;
				break;
			}
		}

		/* decrease read waiter count, and may clear read contention bit */
		rv = fueword32(&rwlock->rw_blocked_readers,
		    &blocked_readers);
		if (rv == -1) {
			umtxq_unbusy_unlocked(&uq->uq_key);
			error = EFAULT;
			break;
		}
		suword32(&rwlock->rw_blocked_readers, blocked_readers-1);
		if (blocked_readers == 1) {
			/* We were the last blocked reader. */
			rv = fueword32(&rwlock->rw_state, &state);
			if (rv == -1)
				error = EFAULT;
			while (error == 0) {
				rv = casueword32(&rwlock->rw_state, state,
				    &oldstate, state & ~URWLOCK_READ_WAITERS);
				if (rv == -1) {
					error = EFAULT;
					break;
				}
				if (oldstate == state)
					break;
				state = oldstate;
				error = umtxq_check_susp(td);
			}
		}

		umtxq_unbusy_unlocked(&uq->uq_key);
		if (error != 0)
			break;
	}
	umtx_key_release(&uq->uq_key);
	if (error == ERESTART)
		error = EINTR;
	return (error);
}
2626 
/*
 * Lock a userland rwlock for writing.  If the attempt fails and the
 * lock meanwhile became free of writers, any readers this thread may
 * have caused to block (by setting URWLOCK_WRITE_WAITERS) are woken
 * before returning.  Returns 0 on success, EFAULT on a user memory
 * fault, or a sleep error (ERESTART is mapped to EINTR at the end).
 */
static int
do_rw_wrlock(struct thread *td, struct urwlock *rwlock, struct _umtx_time *timeout)
{
	struct abs_timeout timo;
	struct umtx_q *uq;
	uint32_t flags;
	int32_t state, oldstate;
	int32_t blocked_writers;
	int32_t blocked_readers;
	int error, rv;

	uq = td->td_umtxq;
	error = fueword32(&rwlock->rw_flags, &flags);
	if (error == -1)
		return (EFAULT);
	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
	if (error != 0)
		return (error);

	if (timeout != NULL)
		abs_timeout_init2(&timo, timeout);

	blocked_readers = 0;
	for (;;) {
		rv = fueword32(&rwlock->rw_state, &state);
		if (rv == -1) {
			umtx_key_release(&uq->uq_key);
			return (EFAULT);
		}
		/* Try to take write ownership while the lock is free. */
		while (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) {
			rv = casueword32(&rwlock->rw_state, state,
			    &oldstate, state | URWLOCK_WRITE_OWNER);
			if (rv == -1) {
				umtx_key_release(&uq->uq_key);
				return (EFAULT);
			}
			if (oldstate == state) {
				umtx_key_release(&uq->uq_key);
				return (0);
			}
			state = oldstate;
			error = umtxq_check_susp(td);
			if (error != 0)
				break;
		}

		if (error) {
			/*
			 * On failure, wake readers we may have blocked
			 * if no writer owns or waits on the lock.
			 */
			if (!(state & (URWLOCK_WRITE_OWNER|URWLOCK_WRITE_WAITERS)) &&
			    blocked_readers != 0) {
				umtxq_lock(&uq->uq_key);
				umtxq_busy(&uq->uq_key);
				umtxq_signal_queue(&uq->uq_key, INT_MAX, UMTX_SHARED_QUEUE);
				umtxq_unbusy(&uq->uq_key);
				umtxq_unlock(&uq->uq_key);
			}

			break;
		}

		/* grab monitor lock */
		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		/*
		 * re-read the state, in case it changed between the try-lock above
		 * and the check below
		 */
		rv = fueword32(&rwlock->rw_state, &state);
		if (rv == -1)
			error = EFAULT;

		/* Set the write-contention bit while the lock is held. */
		while (error == 0 && ((state & URWLOCK_WRITE_OWNER) ||
		    URWLOCK_READER_COUNT(state) != 0) &&
		    (state & URWLOCK_WRITE_WAITERS) == 0) {
			rv = casueword32(&rwlock->rw_state, state,
			    &oldstate, state | URWLOCK_WRITE_WAITERS);
			if (rv == -1) {
				error = EFAULT;
				break;
			}
			if (oldstate == state)
				goto sleep;
			state = oldstate;
			error = umtxq_check_susp(td);
			if (error != 0)
				break;
		}
		if (error != 0) {
			umtxq_unbusy_unlocked(&uq->uq_key);
			break;
		}

		/* Lock became free while setting flags: restart. */
		if (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) {
			umtxq_unbusy_unlocked(&uq->uq_key);
			error = umtxq_check_susp(td);
			if (error != 0)
				break;
			continue;
		}
sleep:
		/* Contention bit is set; count ourselves as blocked. */
		rv = fueword32(&rwlock->rw_blocked_writers,
		    &blocked_writers);
		if (rv == -1) {
			umtxq_unbusy_unlocked(&uq->uq_key);
			error = EFAULT;
			break;
		}
		suword32(&rwlock->rw_blocked_writers, blocked_writers+1);

		while ((state & URWLOCK_WRITE_OWNER) || URWLOCK_READER_COUNT(state) != 0) {
			umtxq_lock(&uq->uq_key);
			umtxq_insert_queue(uq, UMTX_EXCLUSIVE_QUEUE);
			umtxq_unbusy(&uq->uq_key);

			error = umtxq_sleep(uq, "uwrlck", timeout == NULL ?
			    NULL : &timo);

			umtxq_busy(&uq->uq_key);
			umtxq_remove_queue(uq, UMTX_EXCLUSIVE_QUEUE);
			umtxq_unlock(&uq->uq_key);
			if (error)
				break;
			rv = fueword32(&rwlock->rw_state, &state);
			if (rv == -1) {
				error = EFAULT;
				break;
			}
		}

		rv = fueword32(&rwlock->rw_blocked_writers,
		    &blocked_writers);
		if (rv == -1) {
			umtxq_unbusy_unlocked(&uq->uq_key);
			error = EFAULT;
			break;
		}
		suword32(&rwlock->rw_blocked_writers, blocked_writers-1);
		if (blocked_writers == 1) {
			/* Last blocked writer: clear the waiters bit. */
			rv = fueword32(&rwlock->rw_state, &state);
			if (rv == -1) {
				umtxq_unbusy_unlocked(&uq->uq_key);
				error = EFAULT;
				break;
			}
			for (;;) {
				rv = casueword32(&rwlock->rw_state, state,
				    &oldstate, state & ~URWLOCK_WRITE_WAITERS);
				if (rv == -1) {
					error = EFAULT;
					break;
				}
				if (oldstate == state)
					break;
				state = oldstate;
				error = umtxq_check_susp(td);
				/*
				 * We are leaving the URWLOCK_WRITE_WAITERS
				 * behind, but this should not harm the
				 * correctness.
				 */
				if (error != 0)
					break;
			}
			rv = fueword32(&rwlock->rw_blocked_readers,
			    &blocked_readers);
			if (rv == -1) {
				umtxq_unbusy_unlocked(&uq->uq_key);
				error = EFAULT;
				break;
			}
		} else
			blocked_readers = 0;

		umtxq_unbusy_unlocked(&uq->uq_key);
	}

	umtx_key_release(&uq->uq_key);
	if (error == ERESTART)
		error = EINTR;
	return (error);
}
2809 
/*
 * Unlock a userland rwlock: release write ownership, or drop one
 * reader, whichever the current state indicates.  Returns EPERM if the
 * caller does not appear to hold the lock.  After a successful state
 * change, wake waiters according to the lock's preference policy (one
 * writer, or all readers).
 */
static int
do_rw_unlock(struct thread *td, struct urwlock *rwlock)
{
	struct umtx_q *uq;
	uint32_t flags;
	int32_t state, oldstate;
	int error, rv, q, count;

	uq = td->td_umtxq;
	error = fueword32(&rwlock->rw_flags, &flags);
	if (error == -1)
		return (EFAULT);
	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
	if (error != 0)
		return (error);

	error = fueword32(&rwlock->rw_state, &state);
	if (error == -1) {
		error = EFAULT;
		goto out;
	}
	if (state & URWLOCK_WRITE_OWNER) {
		/* Release write ownership via CAS retry loop. */
		for (;;) {
			rv = casueword32(&rwlock->rw_state, state,
			    &oldstate, state & ~URWLOCK_WRITE_OWNER);
			if (rv == -1) {
				error = EFAULT;
				goto out;
			}
			if (oldstate != state) {
				state = oldstate;
				if (!(oldstate & URWLOCK_WRITE_OWNER)) {
					error = EPERM;
					goto out;
				}
				error = umtxq_check_susp(td);
				if (error != 0)
					goto out;
			} else
				break;
		}
	} else if (URWLOCK_READER_COUNT(state) != 0) {
		/* Drop one reader from the count. */
		for (;;) {
			rv = casueword32(&rwlock->rw_state, state,
			    &oldstate, state - 1);
			if (rv == -1) {
				error = EFAULT;
				goto out;
			}
			if (oldstate != state) {
				state = oldstate;
				if (URWLOCK_READER_COUNT(oldstate) == 0) {
					error = EPERM;
					goto out;
				}
				error = umtxq_check_susp(td);
				if (error != 0)
					goto out;
			} else
				break;
		}
	} else {
		/* Neither owned nor read-locked: nothing to unlock. */
		error = EPERM;
		goto out;
	}

	count = 0;

	/* Pick which queue to wake based on the preference policy. */
	if (!(flags & URWLOCK_PREFER_READER)) {
		if (state & URWLOCK_WRITE_WAITERS) {
			count = 1;
			q = UMTX_EXCLUSIVE_QUEUE;
		} else if (state & URWLOCK_READ_WAITERS) {
			count = INT_MAX;
			q = UMTX_SHARED_QUEUE;
		}
	} else {
		if (state & URWLOCK_READ_WAITERS) {
			count = INT_MAX;
			q = UMTX_SHARED_QUEUE;
		} else if (state & URWLOCK_WRITE_WAITERS) {
			count = 1;
			q = UMTX_EXCLUSIVE_QUEUE;
		}
	}

	if (count) {
		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_signal_queue(&uq->uq_key, count, q);
		umtxq_unbusy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);
	}
out:
	umtx_key_release(&uq->uq_key);
	return (error);
}
2907 
2908 #if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10)
/*
 * Wait on an old-style userland semaphore (_usem).  The waiter is
 * queued and _has_waiters set before _count is examined, so a
 * concurrent post cannot be missed.  Returns 0 if the count was
 * already non-zero or after a wakeup, EFAULT on a user memory fault,
 * or a sleep error.
 */
static int
do_sem_wait(struct thread *td, struct _usem *sem, struct _umtx_time *timeout)
{
	struct abs_timeout timo;
	struct umtx_q *uq;
	uint32_t flags, count, count1;
	int error, rv;

	uq = td->td_umtxq;
	error = fueword32(&sem->_flags, &flags);
	if (error == -1)
		return (EFAULT);
	error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &uq->uq_key);
	if (error != 0)
		return (error);

	if (timeout != NULL)
		abs_timeout_init2(&timo, timeout);

	umtxq_lock(&uq->uq_key);
	umtxq_busy(&uq->uq_key);
	umtxq_insert(uq);
	umtxq_unlock(&uq->uq_key);
	/* Advertise a waiter, then re-check the count. */
	rv = casueword32(&sem->_has_waiters, 0, &count1, 1);
	if (rv == 0)
		rv = fueword32(&sem->_count, &count);
	if (rv == -1 || count != 0) {
		/* Fault, or semaphore already available: do not sleep. */
		umtxq_lock(&uq->uq_key);
		umtxq_unbusy(&uq->uq_key);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
		umtx_key_release(&uq->uq_key);
		return (rv == -1 ? EFAULT : 0);
	}
	umtxq_lock(&uq->uq_key);
	umtxq_unbusy(&uq->uq_key);

	error = umtxq_sleep(uq, "usem", timeout == NULL ? NULL : &timo);

	if ((uq->uq_flags & UQF_UMTXQ) == 0)
		error = 0;	/* Removed from queue: we were woken. */
	else {
		umtxq_remove(uq);
		/* A relative timeout cannot be restarted. */
		if (error == ERESTART && timeout != NULL &&
		    (timeout->_flags & UMTX_ABSTIME) == 0)
			error = EINTR;
	}
	umtxq_unlock(&uq->uq_key);
	umtx_key_release(&uq->uq_key);
	return (error);
}
2961 
2962 /*
2963  * Signal a userland semaphore.
2964  */
static int
do_sem_wake(struct thread *td, struct _usem *sem)
{
	struct umtx_key key;
	int error, cnt;
	uint32_t flags;

	error = fueword32(&sem->_flags, &flags);
	if (error == -1)
		return (EFAULT);
	if ((error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &key)) != 0)
		return (error);
	umtxq_lock(&key);
	umtxq_busy(&key);
	cnt = umtxq_count(&key);
	if (cnt > 0) {
		/* Wake a single waiter. */
		umtxq_signal(&key, 1);
		/*
		 * Check if count is greater than 0, this means the memory is
		 * still being referenced by user code, so we can safely
		 * update _has_waiters flag.
		 */
		if (cnt == 1) {
			/* Last waiter: drop the chain lock around the
			 * user memory store, then clear the flag. */
			umtxq_unlock(&key);
			error = suword32(&sem->_has_waiters, 0);
			umtxq_lock(&key);
			if (error == -1)
				error = EFAULT;
		}
	}
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	return (error);
}
3000 #endif
3001 
3002 static int
3003 do_sem2_wait(struct thread *td, struct _usem2 *sem, struct _umtx_time *timeout)
3004 {
3005 	struct abs_timeout timo;
3006 	struct umtx_q *uq;
3007 	uint32_t count, flags;
3008 	int error, rv;
3009 
3010 	uq = td->td_umtxq;
3011 	flags = fuword32(&sem->_flags);
3012 	error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &uq->uq_key);
3013 	if (error != 0)
3014 		return (error);
3015 
3016 	if (timeout != NULL)
3017 		abs_timeout_init2(&timo, timeout);
3018 
3019 	umtxq_lock(&uq->uq_key);
3020 	umtxq_busy(&uq->uq_key);
3021 	umtxq_insert(uq);
3022 	umtxq_unlock(&uq->uq_key);
3023 	rv = fueword32(&sem->_count, &count);
3024 	if (rv == -1) {
3025 		umtxq_lock(&uq->uq_key);
3026 		umtxq_unbusy(&uq->uq_key);
3027 		umtxq_remove(uq);
3028 		umtxq_unlock(&uq->uq_key);
3029 		umtx_key_release(&uq->uq_key);
3030 		return (EFAULT);
3031 	}
3032 	for (;;) {
3033 		if (USEM_COUNT(count) != 0) {
3034 			umtxq_lock(&uq->uq_key);
3035 			umtxq_unbusy(&uq->uq_key);
3036 			umtxq_remove(uq);
3037 			umtxq_unlock(&uq->uq_key);
3038 			umtx_key_release(&uq->uq_key);
3039 			return (0);
3040 		}
3041 		if (count == USEM_HAS_WAITERS)
3042 			break;
3043 		rv = casueword32(&sem->_count, 0, &count, USEM_HAS_WAITERS);
3044 		if (rv == -1) {
3045 			umtxq_lock(&uq->uq_key);
3046 			umtxq_unbusy(&uq->uq_key);
3047 			umtxq_remove(uq);
3048 			umtxq_unlock(&uq->uq_key);
3049 			umtx_key_release(&uq->uq_key);
3050 			return (EFAULT);
3051 		}
3052 		if (count == 0)
3053 			break;
3054 	}
3055 	umtxq_lock(&uq->uq_key);
3056 	umtxq_unbusy(&uq->uq_key);
3057 
3058 	error = umtxq_sleep(uq, "usem", timeout == NULL ? NULL : &timo);
3059 
3060 	if ((uq->uq_flags & UQF_UMTXQ) == 0)
3061 		error = 0;
3062 	else {
3063 		umtxq_remove(uq);
3064 		/* A relative timeout cannot be restarted. */
3065 		if (error == ERESTART && timeout != NULL &&
3066 		    (timeout->_flags & UMTX_ABSTIME) == 0)
3067 			error = EINTR;
3068 	}
3069 	umtxq_unlock(&uq->uq_key);
3070 	umtx_key_release(&uq->uq_key);
3071 	return (error);
3072 }
3073 
3074 /*
3075  * Signal a userland semaphore.
3076  */
static int
do_sem2_wake(struct thread *td, struct _usem2 *sem)
{
	struct umtx_key key;
	int error, cnt, rv;
	uint32_t count, flags;

	rv = fueword32(&sem->_flags, &flags);
	if (rv == -1)
		return (EFAULT);
	if ((error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &key)) != 0)
		return (error);
	umtxq_lock(&key);
	umtxq_busy(&key);
	cnt = umtxq_count(&key);
	if (cnt > 0) {
		umtxq_signal(&key, 1);

		/*
		 * If this was the last sleeping thread, clear the waiters
		 * flag in _count.
		 */
		if (cnt == 1) {
			/* Chain lock is dropped around user access;
			 * CAS-retry until the bit is cleared. */
			umtxq_unlock(&key);
			rv = fueword32(&sem->_count, &count);
			while (rv != -1 && count & USEM_HAS_WAITERS)
				rv = casueword32(&sem->_count, count, &count,
				    count & ~USEM_HAS_WAITERS);
			if (rv == -1)
				error = EFAULT;
			umtxq_lock(&key);
		}
	}
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	return (error);
}
3115 
3116 inline int
3117 umtx_copyin_timeout(const void *addr, struct timespec *tsp)
3118 {
3119 	int error;
3120 
3121 	error = copyin(addr, tsp, sizeof(struct timespec));
3122 	if (error == 0) {
3123 		if (tsp->tv_sec < 0 ||
3124 		    tsp->tv_nsec >= 1000000000 ||
3125 		    tsp->tv_nsec < 0)
3126 			error = EINVAL;
3127 	}
3128 	return (error);
3129 }
3130 
3131 static inline int
3132 umtx_copyin_umtx_time(const void *addr, size_t size, struct _umtx_time *tp)
3133 {
3134 	int error;
3135 
3136 	if (size <= sizeof(struct timespec)) {
3137 		tp->_clockid = CLOCK_REALTIME;
3138 		tp->_flags = 0;
3139 		error = copyin(addr, &tp->_timeout, sizeof(struct timespec));
3140 	} else
3141 		error = copyin(addr, tp, sizeof(struct _umtx_time));
3142 	if (error != 0)
3143 		return (error);
3144 	if (tp->_timeout.tv_sec < 0 ||
3145 	    tp->_timeout.tv_nsec >= 1000000000 || tp->_timeout.tv_nsec < 0)
3146 		return (EINVAL);
3147 	return (0);
3148 }
3149 
/*
 * Handler for reserved or compiled-out _umtx_op(2) operations.
 */
static int
__umtx_op_unimpl(struct thread *td, struct _umtx_op_args *uap)
{

	return (EOPNOTSUPP);
}
3156 
3157 static int
3158 __umtx_op_wait(struct thread *td, struct _umtx_op_args *uap)
3159 {
3160 	struct _umtx_time timeout, *tm_p;
3161 	int error;
3162 
3163 	if (uap->uaddr2 == NULL)
3164 		tm_p = NULL;
3165 	else {
3166 		error = umtx_copyin_umtx_time(
3167 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3168 		if (error != 0)
3169 			return (error);
3170 		tm_p = &timeout;
3171 	}
3172 	return do_wait(td, uap->obj, uap->val, tm_p, 0, 0);
3173 }
3174 
3175 static int
3176 __umtx_op_wait_uint(struct thread *td, struct _umtx_op_args *uap)
3177 {
3178 	struct _umtx_time timeout, *tm_p;
3179 	int error;
3180 
3181 	if (uap->uaddr2 == NULL)
3182 		tm_p = NULL;
3183 	else {
3184 		error = umtx_copyin_umtx_time(
3185 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3186 		if (error != 0)
3187 			return (error);
3188 		tm_p = &timeout;
3189 	}
3190 	return do_wait(td, uap->obj, uap->val, tm_p, 1, 0);
3191 }
3192 
3193 static int
3194 __umtx_op_wait_uint_private(struct thread *td, struct _umtx_op_args *uap)
3195 {
3196 	struct _umtx_time *tm_p, timeout;
3197 	int error;
3198 
3199 	if (uap->uaddr2 == NULL)
3200 		tm_p = NULL;
3201 	else {
3202 		error = umtx_copyin_umtx_time(
3203 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3204 		if (error != 0)
3205 			return (error);
3206 		tm_p = &timeout;
3207 	}
3208 	return do_wait(td, uap->obj, uap->val, tm_p, 1, 1);
3209 }
3210 
3211 static int
3212 __umtx_op_wake(struct thread *td, struct _umtx_op_args *uap)
3213 {
3214 	return (kern_umtx_wake(td, uap->obj, uap->val, 0));
3215 }
3216 
3217 #define BATCH_SIZE	128
3218 static int
3219 __umtx_op_nwake_private(struct thread *td, struct _umtx_op_args *uap)
3220 {
3221 	int count = uap->val;
3222 	void *uaddrs[BATCH_SIZE];
3223 	char **upp = (char **)uap->obj;
3224 	int tocopy;
3225 	int error = 0;
3226 	int i, pos = 0;
3227 
3228 	while (count > 0) {
3229 		tocopy = count;
3230 		if (tocopy > BATCH_SIZE)
3231 			tocopy = BATCH_SIZE;
3232 		error = copyin(upp+pos, uaddrs, tocopy * sizeof(char *));
3233 		if (error != 0)
3234 			break;
3235 		for (i = 0; i < tocopy; ++i)
3236 			kern_umtx_wake(td, uaddrs[i], INT_MAX, 1);
3237 		count -= tocopy;
3238 		pos += tocopy;
3239 	}
3240 	return (error);
3241 }
3242 
3243 static int
3244 __umtx_op_wake_private(struct thread *td, struct _umtx_op_args *uap)
3245 {
3246 	return (kern_umtx_wake(td, uap->obj, uap->val, 1));
3247 }
3248 
3249 static int
3250 __umtx_op_lock_umutex(struct thread *td, struct _umtx_op_args *uap)
3251 {
3252 	struct _umtx_time *tm_p, timeout;
3253 	int error;
3254 
3255 	/* Allow a null timespec (wait forever). */
3256 	if (uap->uaddr2 == NULL)
3257 		tm_p = NULL;
3258 	else {
3259 		error = umtx_copyin_umtx_time(
3260 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3261 		if (error != 0)
3262 			return (error);
3263 		tm_p = &timeout;
3264 	}
3265 	return do_lock_umutex(td, uap->obj, tm_p, 0);
3266 }
3267 
3268 static int
3269 __umtx_op_trylock_umutex(struct thread *td, struct _umtx_op_args *uap)
3270 {
3271 	return do_lock_umutex(td, uap->obj, NULL, _UMUTEX_TRY);
3272 }
3273 
3274 static int
3275 __umtx_op_wait_umutex(struct thread *td, struct _umtx_op_args *uap)
3276 {
3277 	struct _umtx_time *tm_p, timeout;
3278 	int error;
3279 
3280 	/* Allow a null timespec (wait forever). */
3281 	if (uap->uaddr2 == NULL)
3282 		tm_p = NULL;
3283 	else {
3284 		error = umtx_copyin_umtx_time(
3285 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3286 		if (error != 0)
3287 			return (error);
3288 		tm_p = &timeout;
3289 	}
3290 	return do_lock_umutex(td, uap->obj, tm_p, _UMUTEX_WAIT);
3291 }
3292 
3293 static int
3294 __umtx_op_wake_umutex(struct thread *td, struct _umtx_op_args *uap)
3295 {
3296 	return do_wake_umutex(td, uap->obj);
3297 }
3298 
3299 static int
3300 __umtx_op_unlock_umutex(struct thread *td, struct _umtx_op_args *uap)
3301 {
3302 	return do_unlock_umutex(td, uap->obj);
3303 }
3304 
3305 static int
3306 __umtx_op_set_ceiling(struct thread *td, struct _umtx_op_args *uap)
3307 {
3308 	return do_set_ceiling(td, uap->obj, uap->val, uap->uaddr1);
3309 }
3310 
3311 static int
3312 __umtx_op_cv_wait(struct thread *td, struct _umtx_op_args *uap)
3313 {
3314 	struct timespec *ts, timeout;
3315 	int error;
3316 
3317 	/* Allow a null timespec (wait forever). */
3318 	if (uap->uaddr2 == NULL)
3319 		ts = NULL;
3320 	else {
3321 		error = umtx_copyin_timeout(uap->uaddr2, &timeout);
3322 		if (error != 0)
3323 			return (error);
3324 		ts = &timeout;
3325 	}
3326 	return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
3327 }
3328 
3329 static int
3330 __umtx_op_cv_signal(struct thread *td, struct _umtx_op_args *uap)
3331 {
3332 	return do_cv_signal(td, uap->obj);
3333 }
3334 
3335 static int
3336 __umtx_op_cv_broadcast(struct thread *td, struct _umtx_op_args *uap)
3337 {
3338 	return do_cv_broadcast(td, uap->obj);
3339 }
3340 
3341 static int
3342 __umtx_op_rw_rdlock(struct thread *td, struct _umtx_op_args *uap)
3343 {
3344 	struct _umtx_time timeout;
3345 	int error;
3346 
3347 	/* Allow a null timespec (wait forever). */
3348 	if (uap->uaddr2 == NULL) {
3349 		error = do_rw_rdlock(td, uap->obj, uap->val, 0);
3350 	} else {
3351 		error = umtx_copyin_umtx_time(uap->uaddr2,
3352 		   (size_t)uap->uaddr1, &timeout);
3353 		if (error != 0)
3354 			return (error);
3355 		error = do_rw_rdlock(td, uap->obj, uap->val, &timeout);
3356 	}
3357 	return (error);
3358 }
3359 
3360 static int
3361 __umtx_op_rw_wrlock(struct thread *td, struct _umtx_op_args *uap)
3362 {
3363 	struct _umtx_time timeout;
3364 	int error;
3365 
3366 	/* Allow a null timespec (wait forever). */
3367 	if (uap->uaddr2 == NULL) {
3368 		error = do_rw_wrlock(td, uap->obj, 0);
3369 	} else {
3370 		error = umtx_copyin_umtx_time(uap->uaddr2,
3371 		   (size_t)uap->uaddr1, &timeout);
3372 		if (error != 0)
3373 			return (error);
3374 
3375 		error = do_rw_wrlock(td, uap->obj, &timeout);
3376 	}
3377 	return (error);
3378 }
3379 
3380 static int
3381 __umtx_op_rw_unlock(struct thread *td, struct _umtx_op_args *uap)
3382 {
3383 	return do_rw_unlock(td, uap->obj);
3384 }
3385 
3386 #if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10)
3387 static int
3388 __umtx_op_sem_wait(struct thread *td, struct _umtx_op_args *uap)
3389 {
3390 	struct _umtx_time *tm_p, timeout;
3391 	int error;
3392 
3393 	/* Allow a null timespec (wait forever). */
3394 	if (uap->uaddr2 == NULL)
3395 		tm_p = NULL;
3396 	else {
3397 		error = umtx_copyin_umtx_time(
3398 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3399 		if (error != 0)
3400 			return (error);
3401 		tm_p = &timeout;
3402 	}
3403 	return (do_sem_wait(td, uap->obj, tm_p));
3404 }
3405 
3406 static int
3407 __umtx_op_sem_wake(struct thread *td, struct _umtx_op_args *uap)
3408 {
3409 	return do_sem_wake(td, uap->obj);
3410 }
3411 #endif
3412 
3413 static int
3414 __umtx_op_wake2_umutex(struct thread *td, struct _umtx_op_args *uap)
3415 {
3416 	return do_wake2_umutex(td, uap->obj, uap->val);
3417 }
3418 
3419 static int
3420 __umtx_op_sem2_wait(struct thread *td, struct _umtx_op_args *uap)
3421 {
3422 	struct _umtx_time *tm_p, timeout;
3423 	int error;
3424 
3425 	/* Allow a null timespec (wait forever). */
3426 	if (uap->uaddr2 == NULL)
3427 		tm_p = NULL;
3428 	else {
3429 		error = umtx_copyin_umtx_time(
3430 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3431 		if (error != 0)
3432 			return (error);
3433 		tm_p = &timeout;
3434 	}
3435 	return (do_sem2_wait(td, uap->obj, tm_p));
3436 }
3437 
3438 static int
3439 __umtx_op_sem2_wake(struct thread *td, struct _umtx_op_args *uap)
3440 {
3441 	return do_sem2_wake(td, uap->obj);
3442 }
3443 
typedef int (*_umtx_op_func)(struct thread *td, struct _umtx_op_args *uap);

/*
 * Dispatch table for _umtx_op(2), indexed by the UMTX_OP_* constant.
 * The entry order must match the UMTX_OP_* definitions; operations
 * compiled out point at __umtx_op_unimpl.
 */
static _umtx_op_func op_table[] = {
	__umtx_op_unimpl,		/* UMTX_OP_RESERVED0 */
	__umtx_op_unimpl,		/* UMTX_OP_RESERVED1 */
	__umtx_op_wait,			/* UMTX_OP_WAIT */
	__umtx_op_wake,			/* UMTX_OP_WAKE */
	__umtx_op_trylock_umutex,	/* UMTX_OP_MUTEX_TRYLOCK */
	__umtx_op_lock_umutex,		/* UMTX_OP_MUTEX_LOCK */
	__umtx_op_unlock_umutex,	/* UMTX_OP_MUTEX_UNLOCK */
	__umtx_op_set_ceiling,		/* UMTX_OP_SET_CEILING */
	__umtx_op_cv_wait,		/* UMTX_OP_CV_WAIT*/
	__umtx_op_cv_signal,		/* UMTX_OP_CV_SIGNAL */
	__umtx_op_cv_broadcast,		/* UMTX_OP_CV_BROADCAST */
	__umtx_op_wait_uint,		/* UMTX_OP_WAIT_UINT */
	__umtx_op_rw_rdlock,		/* UMTX_OP_RW_RDLOCK */
	__umtx_op_rw_wrlock,		/* UMTX_OP_RW_WRLOCK */
	__umtx_op_rw_unlock,		/* UMTX_OP_RW_UNLOCK */
	__umtx_op_wait_uint_private,	/* UMTX_OP_WAIT_UINT_PRIVATE */
	__umtx_op_wake_private,		/* UMTX_OP_WAKE_PRIVATE */
	__umtx_op_wait_umutex,		/* UMTX_OP_MUTEX_WAIT */
	__umtx_op_wake_umutex,		/* UMTX_OP_MUTEX_WAKE */
#if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10)
	__umtx_op_sem_wait,		/* UMTX_OP_SEM_WAIT */
	__umtx_op_sem_wake,		/* UMTX_OP_SEM_WAKE */
#else
	__umtx_op_unimpl,		/* UMTX_OP_SEM_WAIT */
	__umtx_op_unimpl,		/* UMTX_OP_SEM_WAKE */
#endif
	__umtx_op_nwake_private,	/* UMTX_OP_NWAKE_PRIVATE */
	__umtx_op_wake2_umutex,		/* UMTX_OP_MUTEX_WAKE2 */
	__umtx_op_sem2_wait,		/* UMTX_OP_SEM2_WAIT */
	__umtx_op_sem2_wake,		/* UMTX_OP_SEM2_WAKE */
};
3478 
3479 int
3480 sys__umtx_op(struct thread *td, struct _umtx_op_args *uap)
3481 {
3482 	if ((unsigned)uap->op < UMTX_OP_MAX)
3483 		return (*op_table[uap->op])(td, uap);
3484 	return (EINVAL);
3485 }
3486 
3487 #ifdef COMPAT_FREEBSD32
3488 
/*
 * ILP32 wire layout of struct timespec as passed in by 32-bit
 * processes; both fields are 32 bits wide.  Layout must match the
 * 32-bit userland ABI exactly.
 */
struct timespec32 {
	int32_t tv_sec;
	int32_t tv_nsec;
};
3493 
/*
 * ILP32 wire layout of struct _umtx_time as passed in by 32-bit
 * processes.  Layout must match the 32-bit userland ABI exactly.
 */
struct umtx_time32 {
	struct	timespec32	timeout;
	uint32_t		flags;
	uint32_t		clockid;
};
3499 
3500 static inline int
3501 umtx_copyin_timeout32(void *addr, struct timespec *tsp)
3502 {
3503 	struct timespec32 ts32;
3504 	int error;
3505 
3506 	error = copyin(addr, &ts32, sizeof(struct timespec32));
3507 	if (error == 0) {
3508 		if (ts32.tv_sec < 0 ||
3509 		    ts32.tv_nsec >= 1000000000 ||
3510 		    ts32.tv_nsec < 0)
3511 			error = EINVAL;
3512 		else {
3513 			tsp->tv_sec = ts32.tv_sec;
3514 			tsp->tv_nsec = ts32.tv_nsec;
3515 		}
3516 	}
3517 	return (error);
3518 }
3519 
3520 static inline int
3521 umtx_copyin_umtx_time32(const void *addr, size_t size, struct _umtx_time *tp)
3522 {
3523 	struct umtx_time32 t32;
3524 	int error;
3525 
3526 	t32.clockid = CLOCK_REALTIME;
3527 	t32.flags   = 0;
3528 	if (size <= sizeof(struct timespec32))
3529 		error = copyin(addr, &t32.timeout, sizeof(struct timespec32));
3530 	else
3531 		error = copyin(addr, &t32, sizeof(struct umtx_time32));
3532 	if (error != 0)
3533 		return (error);
3534 	if (t32.timeout.tv_sec < 0 ||
3535 	    t32.timeout.tv_nsec >= 1000000000 || t32.timeout.tv_nsec < 0)
3536 		return (EINVAL);
3537 	tp->_timeout.tv_sec = t32.timeout.tv_sec;
3538 	tp->_timeout.tv_nsec = t32.timeout.tv_nsec;
3539 	tp->_flags = t32.flags;
3540 	tp->_clockid = t32.clockid;
3541 	return (0);
3542 }
3543 
3544 static int
3545 __umtx_op_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
3546 {
3547 	struct _umtx_time *tm_p, timeout;
3548 	int error;
3549 
3550 	if (uap->uaddr2 == NULL)
3551 		tm_p = NULL;
3552 	else {
3553 		error = umtx_copyin_umtx_time32(uap->uaddr2,
3554 			(size_t)uap->uaddr1, &timeout);
3555 		if (error != 0)
3556 			return (error);
3557 		tm_p = &timeout;
3558 	}
3559 	return do_wait(td, uap->obj, uap->val, tm_p, 1, 0);
3560 }
3561 
3562 static int
3563 __umtx_op_lock_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
3564 {
3565 	struct _umtx_time *tm_p, timeout;
3566 	int error;
3567 
3568 	/* Allow a null timespec (wait forever). */
3569 	if (uap->uaddr2 == NULL)
3570 		tm_p = NULL;
3571 	else {
3572 		error = umtx_copyin_umtx_time(uap->uaddr2,
3573 			    (size_t)uap->uaddr1, &timeout);
3574 		if (error != 0)
3575 			return (error);
3576 		tm_p = &timeout;
3577 	}
3578 	return do_lock_umutex(td, uap->obj, tm_p, 0);
3579 }
3580 
3581 static int
3582 __umtx_op_wait_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
3583 {
3584 	struct _umtx_time *tm_p, timeout;
3585 	int error;
3586 
3587 	/* Allow a null timespec (wait forever). */
3588 	if (uap->uaddr2 == NULL)
3589 		tm_p = NULL;
3590 	else {
3591 		error = umtx_copyin_umtx_time32(uap->uaddr2,
3592 		    (size_t)uap->uaddr1, &timeout);
3593 		if (error != 0)
3594 			return (error);
3595 		tm_p = &timeout;
3596 	}
3597 	return do_lock_umutex(td, uap->obj, tm_p, _UMUTEX_WAIT);
3598 }
3599 
3600 static int
3601 __umtx_op_cv_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
3602 {
3603 	struct timespec *ts, timeout;
3604 	int error;
3605 
3606 	/* Allow a null timespec (wait forever). */
3607 	if (uap->uaddr2 == NULL)
3608 		ts = NULL;
3609 	else {
3610 		error = umtx_copyin_timeout32(uap->uaddr2, &timeout);
3611 		if (error != 0)
3612 			return (error);
3613 		ts = &timeout;
3614 	}
3615 	return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
3616 }
3617 
3618 static int
3619 __umtx_op_rw_rdlock_compat32(struct thread *td, struct _umtx_op_args *uap)
3620 {
3621 	struct _umtx_time timeout;
3622 	int error;
3623 
3624 	/* Allow a null timespec (wait forever). */
3625 	if (uap->uaddr2 == NULL) {
3626 		error = do_rw_rdlock(td, uap->obj, uap->val, 0);
3627 	} else {
3628 		error = umtx_copyin_umtx_time32(uap->uaddr2,
3629 		    (size_t)uap->uaddr1, &timeout);
3630 		if (error != 0)
3631 			return (error);
3632 		error = do_rw_rdlock(td, uap->obj, uap->val, &timeout);
3633 	}
3634 	return (error);
3635 }
3636 
3637 static int
3638 __umtx_op_rw_wrlock_compat32(struct thread *td, struct _umtx_op_args *uap)
3639 {
3640 	struct _umtx_time timeout;
3641 	int error;
3642 
3643 	/* Allow a null timespec (wait forever). */
3644 	if (uap->uaddr2 == NULL) {
3645 		error = do_rw_wrlock(td, uap->obj, 0);
3646 	} else {
3647 		error = umtx_copyin_umtx_time32(uap->uaddr2,
3648 		    (size_t)uap->uaddr1, &timeout);
3649 		if (error != 0)
3650 			return (error);
3651 		error = do_rw_wrlock(td, uap->obj, &timeout);
3652 	}
3653 	return (error);
3654 }
3655 
3656 static int
3657 __umtx_op_wait_uint_private_compat32(struct thread *td, struct _umtx_op_args *uap)
3658 {
3659 	struct _umtx_time *tm_p, timeout;
3660 	int error;
3661 
3662 	if (uap->uaddr2 == NULL)
3663 		tm_p = NULL;
3664 	else {
3665 		error = umtx_copyin_umtx_time32(
3666 		    uap->uaddr2, (size_t)uap->uaddr1,&timeout);
3667 		if (error != 0)
3668 			return (error);
3669 		tm_p = &timeout;
3670 	}
3671 	return do_wait(td, uap->obj, uap->val, tm_p, 1, 1);
3672 }
3673 
#if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10)
/* 32-bit compat handler for the legacy UMTX_OP_SEM_WAIT. */
static int
__umtx_op_sem_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
{
	struct _umtx_time timeout;
	struct _umtx_time *tm_p = NULL;
	int error;

	/* A NULL uaddr2 means wait forever. */
	if (uap->uaddr2 != NULL) {
		error = umtx_copyin_umtx_time32(uap->uaddr2,
		    (size_t)uap->uaddr1, &timeout);
		if (error != 0)
			return (error);
		tm_p = &timeout;
	}
	return (do_sem_wait(td, uap->obj, tm_p));
}
#endif
3694 
3695 static int
3696 __umtx_op_sem2_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
3697 {
3698 	struct _umtx_time *tm_p, timeout;
3699 	int error;
3700 
3701 	/* Allow a null timespec (wait forever). */
3702 	if (uap->uaddr2 == NULL)
3703 		tm_p = NULL;
3704 	else {
3705 		error = umtx_copyin_umtx_time32(uap->uaddr2,
3706 		    (size_t)uap->uaddr1, &timeout);
3707 		if (error != 0)
3708 			return (error);
3709 		tm_p = &timeout;
3710 	}
3711 	return (do_sem2_wait(td, uap->obj, tm_p));
3712 }
3713 
3714 static int
3715 __umtx_op_nwake_private32(struct thread *td, struct _umtx_op_args *uap)
3716 {
3717 	int count = uap->val;
3718 	uint32_t uaddrs[BATCH_SIZE];
3719 	uint32_t **upp = (uint32_t **)uap->obj;
3720 	int tocopy;
3721 	int error = 0;
3722 	int i, pos = 0;
3723 
3724 	while (count > 0) {
3725 		tocopy = count;
3726 		if (tocopy > BATCH_SIZE)
3727 			tocopy = BATCH_SIZE;
3728 		error = copyin(upp+pos, uaddrs, tocopy * sizeof(uint32_t));
3729 		if (error != 0)
3730 			break;
3731 		for (i = 0; i < tocopy; ++i)
3732 			kern_umtx_wake(td, (void *)(intptr_t)uaddrs[i],
3733 				INT_MAX, 1);
3734 		count -= tocopy;
3735 		pos += tocopy;
3736 	}
3737 	return (error);
3738 }
3739 
/*
 * 32-bit compat dispatch table; the entry index must equal the numeric
 * value of the corresponding UMTX_OP_* constant.  Operations whose
 * argument layouts are identical in both ABIs reuse the native
 * handlers.  (The MUTEX_TRYLOCK/MUTEX_LOCK comments were previously
 * swapped relative to the native op_table; the handler order itself
 * matches the native table.)
 */
static _umtx_op_func op_table_compat32[] = {
	__umtx_op_unimpl,		/* UMTX_OP_RESERVED0 */
	__umtx_op_unimpl,		/* UMTX_OP_RESERVED1 */
	__umtx_op_wait_compat32,	/* UMTX_OP_WAIT */
	__umtx_op_wake,			/* UMTX_OP_WAKE */
	__umtx_op_trylock_umutex,	/* UMTX_OP_MUTEX_TRYLOCK */
	__umtx_op_lock_umutex_compat32,	/* UMTX_OP_MUTEX_LOCK */
	__umtx_op_unlock_umutex,	/* UMTX_OP_MUTEX_UNLOCK	*/
	__umtx_op_set_ceiling,		/* UMTX_OP_SET_CEILING */
	__umtx_op_cv_wait_compat32,	/* UMTX_OP_CV_WAIT*/
	__umtx_op_cv_signal,		/* UMTX_OP_CV_SIGNAL */
	__umtx_op_cv_broadcast,		/* UMTX_OP_CV_BROADCAST */
	__umtx_op_wait_compat32,	/* UMTX_OP_WAIT_UINT */
	__umtx_op_rw_rdlock_compat32,	/* UMTX_OP_RW_RDLOCK */
	__umtx_op_rw_wrlock_compat32,	/* UMTX_OP_RW_WRLOCK */
	__umtx_op_rw_unlock,		/* UMTX_OP_RW_UNLOCK */
	__umtx_op_wait_uint_private_compat32,	/* UMTX_OP_WAIT_UINT_PRIVATE */
	__umtx_op_wake_private,		/* UMTX_OP_WAKE_PRIVATE */
	__umtx_op_wait_umutex_compat32, /* UMTX_OP_MUTEX_WAIT */
	__umtx_op_wake_umutex,		/* UMTX_OP_MUTEX_WAKE */
#if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10)
	__umtx_op_sem_wait_compat32,	/* UMTX_OP_SEM_WAIT */
	__umtx_op_sem_wake,		/* UMTX_OP_SEM_WAKE */
#else
	__umtx_op_unimpl,		/* UMTX_OP_SEM_WAIT */
	__umtx_op_unimpl,		/* UMTX_OP_SEM_WAKE */
#endif
	__umtx_op_nwake_private32,	/* UMTX_OP_NWAKE_PRIVATE */
	__umtx_op_wake2_umutex,		/* UMTX_OP_MUTEX_WAKE2 */
	__umtx_op_sem2_wait_compat32,	/* UMTX_OP_SEM2_WAIT */
	__umtx_op_sem2_wake,		/* UMTX_OP_SEM2_WAKE */
};
3772 
3773 int
3774 freebsd32_umtx_op(struct thread *td, struct freebsd32_umtx_op_args *uap)
3775 {
3776 	if ((unsigned)uap->op < UMTX_OP_MAX)
3777 		return (*op_table_compat32[uap->op])(td,
3778 			(struct _umtx_op_args *)uap);
3779 	return (EINVAL);
3780 }
3781 #endif
3782 
3783 void
3784 umtx_thread_init(struct thread *td)
3785 {
3786 	td->td_umtxq = umtxq_alloc();
3787 	td->td_umtxq->uq_thread = td;
3788 }
3789 
/*
 * Release the per-thread umtx queue allocated by umtx_thread_init().
 */
void
umtx_thread_fini(struct thread *td)
{
	umtxq_free(td->td_umtxq);
}
3795 
3796 /*
 * Called when a new thread is created, e.g. by fork().
3798  */
void
umtx_thread_alloc(struct thread *td)
{
	struct umtx_q *uq;

	uq = td->td_umtxq;
	/* Start with no inherited priority (PRI_MAX is the lowest). */
	uq->uq_inherited_pri = PRI_MAX;

	/*
	 * A (possibly recycled) umtx queue must carry no stale state:
	 * no flags, correct back-link, not blocked on and not owning
	 * any priority-inheritance mutexes.
	 */
	KASSERT(uq->uq_flags == 0, ("uq_flags != 0"));
	KASSERT(uq->uq_thread == td, ("uq_thread != td"));
	KASSERT(uq->uq_pi_blocked == NULL, ("uq_pi_blocked != NULL"));
	KASSERT(TAILQ_EMPTY(&uq->uq_pi_contested), ("uq_pi_contested is not empty"));
}
3812 
3813 /*
3814  * exec() hook.
3815  */
static void
umtx_exec_hook(void *arg __unused, struct proc *p __unused,
	struct image_params *imgp __unused)
{
	/* On exec, tear down the calling thread's umtx state. */
	umtx_thread_cleanup(curthread);
}
3822 
3823 /*
3824  * thread_exit() hook.
3825  */
void
umtx_thread_exit(struct thread *td)
{
	/* Tear down the exiting thread's umtx state. */
	umtx_thread_cleanup(td);
}
3831 
3832 /*
3833  * clean up umtx data.
3834  */
static void
umtx_thread_cleanup(struct thread *td)
{
	struct umtx_q *uq;
	struct umtx_pi *pi;

	if ((uq = td->td_umtxq) == NULL)
		return;

	mtx_lock(&umtx_lock);
	/* Drop any priority inherited through PI mutexes. */
	uq->uq_inherited_pri = PRI_MAX;
	/*
	 * Disown every priority-inheritance mutex still attributed to
	 * this thread so later waiters do not propagate priority to a
	 * dead owner.  The entries are unlinked under umtx_lock.
	 */
	while ((pi = TAILQ_FIRST(&uq->uq_pi_contested)) != NULL) {
		pi->pi_owner = NULL;
		TAILQ_REMOVE(&uq->uq_pi_contested, pi, pi_link);
	}
	mtx_unlock(&umtx_lock);
	/* Revert any lent user priority; thread lock required. */
	thread_lock(td);
	sched_lend_user_prio(td, PRI_MAX);
	thread_unlock(td);
}
3855