xref: /freebsd/sys/compat/linux/linux_futex.c (revision e0d3ea8c655d56fd364dba6d91dec8d3e443cb80)
1 /*	$NetBSD: linux_futex.c,v 1.7 2006/07/24 19:01:49 manu Exp $ */
2 
3 /*-
4  * Copyright (c) 2005 Emmanuel Dreyfus, all rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  * 3. All advertising materials mentioning features or use of this software
15  *    must display the following acknowledgement:
16  *	This product includes software developed by Emmanuel Dreyfus
17  * 4. The name of the author may not be used to endorse or promote
18  *    products derived from this software without specific prior written
19  *    permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE THE AUTHOR AND CONTRIBUTORS ``AS IS''
22  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
23  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
24  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS
25  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31  * POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include <sys/cdefs.h>
35 __FBSDID("$FreeBSD$");
36 #if 0
37 __KERNEL_RCSID(1, "$NetBSD: linux_futex.c,v 1.7 2006/07/24 19:01:49 manu Exp $");
38 #endif
39 
40 #include "opt_compat.h"
41 
42 #include <sys/param.h>
43 #include <sys/systm.h>
44 #include <sys/imgact.h>
45 #include <sys/kernel.h>
46 #include <sys/ktr.h>
47 #include <sys/lock.h>
48 #include <sys/malloc.h>
49 #include <sys/mutex.h>
50 #include <sys/priv.h>
51 #include <sys/proc.h>
52 #include <sys/queue.h>
53 #include <sys/sched.h>
54 #include <sys/sdt.h>
55 #include <sys/sx.h>
56 #include <sys/umtx.h>
57 
58 #ifdef COMPAT_LINUX32
59 #include <machine/../linux32/linux.h>
60 #include <machine/../linux32/linux32_proto.h>
61 #else
62 #include <machine/../linux/linux.h>
63 #include <machine/../linux/linux_proto.h>
64 #endif
65 #include <compat/linux/linux_dtrace.h>
66 #include <compat/linux/linux_emul.h>
67 #include <compat/linux/linux_futex.h>
68 #include <compat/linux/linux_util.h>
69 
70 /* DTrace init */
71 LIN_SDT_PROVIDER_DECLARE(LINUX_DTRACE);
72 
73 /**
74  * Futex part for the special DTrace module "locks".
75  */
76 LIN_SDT_PROBE_DEFINE1(locks, futex_mtx, locked, "struct mtx *");
77 LIN_SDT_PROBE_DEFINE1(locks, futex_mtx, unlock, "struct mtx *");
78 
79 /**
80  * Per futex probes.
81  */
82 LIN_SDT_PROBE_DEFINE1(futex, futex, create, "struct sx *");
83 LIN_SDT_PROBE_DEFINE1(futex, futex, destroy, "struct sx *");
84 
85 /**
86  * DTrace probes in this module.
87  */
88 LIN_SDT_PROBE_DEFINE2(futex, futex_put, entry, "struct futex *",
89     "struct waiting_proc *");
90 LIN_SDT_PROBE_DEFINE3(futex, futex_put, destroy, "uint32_t *", "uint32_t",
91     "int");
92 LIN_SDT_PROBE_DEFINE3(futex, futex_put, unlock, "uint32_t *", "uint32_t",
93     "int");
94 LIN_SDT_PROBE_DEFINE0(futex, futex_put, return);
95 LIN_SDT_PROBE_DEFINE3(futex, futex_get0, entry, "uint32_t *", "struct futex **",
96     "uint32_t");
97 LIN_SDT_PROBE_DEFINE1(futex, futex_get0, umtx_key_get_error, "int");
98 LIN_SDT_PROBE_DEFINE3(futex, futex_get0, shared, "uint32_t *", "uint32_t",
99     "int");
100 LIN_SDT_PROBE_DEFINE1(futex, futex_get0, null, "uint32_t *");
101 LIN_SDT_PROBE_DEFINE3(futex, futex_get0, new, "uint32_t *", "uint32_t", "int");
102 LIN_SDT_PROBE_DEFINE1(futex, futex_get0, return, "int");
103 LIN_SDT_PROBE_DEFINE3(futex, futex_get, entry, "uint32_t *",
104     "struct waiting_proc **", "struct futex **");
105 LIN_SDT_PROBE_DEFINE0(futex, futex_get, error);
106 LIN_SDT_PROBE_DEFINE1(futex, futex_get, return, "int");
107 LIN_SDT_PROBE_DEFINE3(futex, futex_sleep, entry, "struct futex *",
108     "struct waiting_proc **", "int");
109 LIN_SDT_PROBE_DEFINE5(futex, futex_sleep, requeue_error, "int", "uint32_t *",
110     "struct waiting_proc *", "uint32_t *", "uint32_t");
111 LIN_SDT_PROBE_DEFINE3(futex, futex_sleep, sleep_error, "int", "uint32_t *",
112     "struct waiting_proc *");
113 LIN_SDT_PROBE_DEFINE1(futex, futex_sleep, return, "int");
114 LIN_SDT_PROBE_DEFINE3(futex, futex_wake, entry, "struct futex *", "int",
115     "uint32_t");
116 LIN_SDT_PROBE_DEFINE3(futex, futex_wake, iterate, "uint32_t",
117     "struct waiting_proc *", "uint32_t");
118 LIN_SDT_PROBE_DEFINE1(futex, futex_wake, wakeup, "struct waiting_proc *");
119 LIN_SDT_PROBE_DEFINE1(futex, futex_wake, return, "int");
120 LIN_SDT_PROBE_DEFINE4(futex, futex_requeue, entry, "struct futex *", "int",
121     "struct futex *", "int");
122 LIN_SDT_PROBE_DEFINE1(futex, futex_requeue, wakeup, "struct waiting_proc *");
123 LIN_SDT_PROBE_DEFINE3(futex, futex_requeue, requeue, "uint32_t *",
124     "struct waiting_proc *", "uint32_t");
125 LIN_SDT_PROBE_DEFINE1(futex, futex_requeue, return, "int");
126 LIN_SDT_PROBE_DEFINE4(futex, futex_wait, entry, "struct futex *",
127     "struct waiting_proc **", "int", "uint32_t");
128 LIN_SDT_PROBE_DEFINE1(futex, futex_wait, sleep_error, "int");
129 LIN_SDT_PROBE_DEFINE1(futex, futex_wait, return, "int");
130 LIN_SDT_PROBE_DEFINE3(futex, futex_atomic_op, entry, "struct thread *",
131     "int", "uint32_t");
132 LIN_SDT_PROBE_DEFINE4(futex, futex_atomic_op, decoded_op, "int", "int", "int",
133     "int");
134 LIN_SDT_PROBE_DEFINE0(futex, futex_atomic_op, missing_access_check);
135 LIN_SDT_PROBE_DEFINE1(futex, futex_atomic_op, unimplemented_op, "int");
136 LIN_SDT_PROBE_DEFINE1(futex, futex_atomic_op, unimplemented_cmp, "int");
137 LIN_SDT_PROBE_DEFINE1(futex, futex_atomic_op, return, "int");
138 LIN_SDT_PROBE_DEFINE2(futex, linux_sys_futex, entry, "struct thread *",
139     "struct linux_sys_futex_args *");
140 LIN_SDT_PROBE_DEFINE0(futex, linux_sys_futex, unimplemented_clockswitch);
141 LIN_SDT_PROBE_DEFINE1(futex, linux_sys_futex, itimerfix_error, "int");
142 LIN_SDT_PROBE_DEFINE1(futex, linux_sys_futex, copyin_error, "int");
143 LIN_SDT_PROBE_DEFINE0(futex, linux_sys_futex, invalid_cmp_requeue_use);
144 LIN_SDT_PROBE_DEFINE3(futex, linux_sys_futex, debug_wait, "uint32_t *",
145     "uint32_t", "uint32_t");
146 LIN_SDT_PROBE_DEFINE4(futex, linux_sys_futex, debug_wait_value_neq,
147     "uint32_t *", "uint32_t", "int", "uint32_t");
148 LIN_SDT_PROBE_DEFINE3(futex, linux_sys_futex, debug_wake, "uint32_t *",
149     "uint32_t", "uint32_t");
150 LIN_SDT_PROBE_DEFINE5(futex, linux_sys_futex, debug_cmp_requeue, "uint32_t *",
151     "uint32_t", "uint32_t", "uint32_t *", "struct l_timespec *");
152 LIN_SDT_PROBE_DEFINE2(futex, linux_sys_futex, debug_cmp_requeue_value_neq,
153     "uint32_t", "int");
154 LIN_SDT_PROBE_DEFINE5(futex, linux_sys_futex, debug_wake_op, "uint32_t *",
155     "int", "uint32_t", "uint32_t *", "uint32_t");
156 LIN_SDT_PROBE_DEFINE0(futex, linux_sys_futex, unhandled_efault);
157 LIN_SDT_PROBE_DEFINE0(futex, linux_sys_futex, unimplemented_lock_pi);
158 LIN_SDT_PROBE_DEFINE0(futex, linux_sys_futex, unimplemented_unlock_pi);
159 LIN_SDT_PROBE_DEFINE0(futex, linux_sys_futex, unimplemented_trylock_pi);
160 LIN_SDT_PROBE_DEFINE0(futex, linux_sys_futex, deprecated_requeue);
161 LIN_SDT_PROBE_DEFINE0(futex, linux_sys_futex, unimplemented_wait_requeue_pi);
162 LIN_SDT_PROBE_DEFINE0(futex, linux_sys_futex, unimplemented_cmp_requeue_pi);
163 LIN_SDT_PROBE_DEFINE1(futex, linux_sys_futex, unknown_operation, "int");
164 LIN_SDT_PROBE_DEFINE1(futex, linux_sys_futex, return, "int");
165 LIN_SDT_PROBE_DEFINE2(futex, linux_set_robust_list, entry, "struct thread *",
166     "struct linux_set_robust_list_args *");
167 LIN_SDT_PROBE_DEFINE0(futex, linux_set_robust_list, size_error);
168 LIN_SDT_PROBE_DEFINE1(futex, linux_set_robust_list, return, "int");
169 LIN_SDT_PROBE_DEFINE2(futex, linux_get_robust_list, entry, "struct thread *",
170     "struct linux_get_robust_list_args *");
171 LIN_SDT_PROBE_DEFINE1(futex, linux_get_robust_list, copyout_error, "int");
172 LIN_SDT_PROBE_DEFINE1(futex, linux_get_robust_list, return, "int");
173 LIN_SDT_PROBE_DEFINE3(futex, handle_futex_death, entry,
174     "struct linux_emuldata *", "uint32_t *", "unsigned int");
175 LIN_SDT_PROBE_DEFINE1(futex, handle_futex_death, copyin_error, "int");
176 LIN_SDT_PROBE_DEFINE1(futex, handle_futex_death, return, "int");
177 LIN_SDT_PROBE_DEFINE3(futex, fetch_robust_entry, entry,
178     "struct linux_robust_list **", "struct linux_robust_list **",
179     "unsigned int *");
180 LIN_SDT_PROBE_DEFINE1(futex, fetch_robust_entry, copyin_error, "int");
181 LIN_SDT_PROBE_DEFINE1(futex, fetch_robust_entry, return, "int");
182 LIN_SDT_PROBE_DEFINE2(futex, release_futexes, entry, "struct thread *",
183     "struct linux_emuldata *");
184 LIN_SDT_PROBE_DEFINE1(futex, release_futexes, copyin_error, "int");
185 LIN_SDT_PROBE_DEFINE0(futex, release_futexes, return);
186 
187 struct futex;
188 
189 struct waiting_proc {
190 	uint32_t	wp_flags;
191 	struct futex	*wp_futex;
192 	TAILQ_ENTRY(waiting_proc) wp_list;
193 };
194 
195 struct futex {
196 	struct sx	f_lck;
197 	uint32_t	*f_uaddr;	/* user-supplied value, for debug */
198 	struct umtx_key	f_key;
199 	uint32_t	f_refcount;
200 	uint32_t	f_bitset;
201 	LIST_ENTRY(futex) f_list;
202 	TAILQ_HEAD(lf_waiting_proc, waiting_proc) f_waiting_proc;
203 };
204 
205 struct futex_list futex_list;
206 
/*
 * Helpers for the per-futex sx lock.  Each takes a struct futex pointer.
 * FUTEX_INIT() and FUTEX_DESTROY() also fire the futex:futex:create and
 * futex:futex:destroy probes with the address of the lock.
 */
#define	FUTEX_LOCK(f)		sx_xlock(&(f)->f_lck)
#define	FUTEX_UNLOCK(f)		sx_xunlock(&(f)->f_lck)

#define	FUTEX_INIT(f)		do {					\
	sx_init_flags(&(f)->f_lck, "ftlk", SX_DUPOK);			\
	LIN_SDT_PROBE1(futex, futex, create, &(f)->f_lck);		\
} while (0)

#define	FUTEX_DESTROY(f)	do {					\
	LIN_SDT_PROBE1(futex, futex, destroy, &(f)->f_lck);		\
	sx_destroy(&(f)->f_lck);					\
} while (0)

/* Assert that the caller holds the futex lock exclusively. */
#define	FUTEX_ASSERT_LOCKED(f)	sx_assert(&(f)->f_lck, SA_XLOCKED)
221 
222 struct mtx futex_mtx;			/* protects the futex list */
223 #define FUTEXES_LOCK		do { \
224 				    mtx_lock(&futex_mtx); \
225 				    LIN_SDT_PROBE1(locks, futex_mtx, \
226 					locked, &futex_mtx); \
227 				} while (0)
228 #define FUTEXES_UNLOCK		do { \
229 				    LIN_SDT_PROBE1(locks, futex_mtx, \
230 					unlock, &futex_mtx); \
231 				    mtx_unlock(&futex_mtx); \
232 				} while (0)
233 
/*
 * Flags accepted by futex_get() / futex_get0().
 */
/* Allocate a waiting_proc and queue it on the futex. */
#define	FUTEX_CREATE_WP		0x1
/* Do not create the futex if it does not exist; return a NULL futex. */
#define	FUTEX_DONTCREATE	0x2
/* Fail with EINVAL if the futex already exists. */
#define	FUTEX_DONTEXISTS	0x4
/* Shared futex: the key is obtained with AUTO_SHARE, not THREAD_SHARE. */
#define	FUTEX_SHARED		0x8

/*
 * Bits kept in waiting_proc.wp_flags.
 */
/*
 * The wp was requeued: it has been moved from the wait queue of the futex
 * its thread went to sleep on to the wait queue of another futex.
 */
#define	FUTEX_WP_REQUEUED	0x1
/*
 * The wp has been woken up and already removed from the futex wait queue,
 * which prevents a double wakeup (and a second removal in futex_put()).
 */
#define	FUTEX_WP_REMOVED	0x2
248 
249 /* support.s */
250 int futex_xchgl(int oparg, uint32_t *uaddr, int *oldval);
251 int futex_addl(int oparg, uint32_t *uaddr, int *oldval);
252 int futex_orl(int oparg, uint32_t *uaddr, int *oldval);
253 int futex_andl(int oparg, uint32_t *uaddr, int *oldval);
254 int futex_xorl(int oparg, uint32_t *uaddr, int *oldval);
255 
256 static void
257 futex_put(struct futex *f, struct waiting_proc *wp)
258 {
259 	LIN_SDT_PROBE2(futex, futex_put, entry, f, wp);
260 
261 	FUTEX_ASSERT_LOCKED(f);
262 	if (wp != NULL) {
263 		if ((wp->wp_flags & FUTEX_WP_REMOVED) == 0)
264 			TAILQ_REMOVE(&f->f_waiting_proc, wp, wp_list);
265 		free(wp, M_FUTEX_WP);
266 	}
267 
268 	FUTEXES_LOCK;
269 	if (--f->f_refcount == 0) {
270 		LIST_REMOVE(f, f_list);
271 		FUTEXES_UNLOCK;
272 		FUTEX_UNLOCK(f);
273 
274 		LIN_SDT_PROBE3(futex, futex_put, destroy, f->f_uaddr,
275 		    f->f_refcount, f->f_key.shared);
276 		LINUX_CTR3(sys_futex, "futex_put destroy uaddr %p ref %d "
277 		    "shared %d", f->f_uaddr, f->f_refcount, f->f_key.shared);
278 		umtx_key_release(&f->f_key);
279 		FUTEX_DESTROY(f);
280 		free(f, M_FUTEX);
281 
282 		LIN_SDT_PROBE0(futex, futex_put, return);
283 		return;
284 	}
285 
286 	LIN_SDT_PROBE3(futex, futex_put, unlock, f->f_uaddr, f->f_refcount,
287 	    f->f_key.shared);
288 	LINUX_CTR3(sys_futex, "futex_put uaddr %p ref %d shared %d",
289 	    f->f_uaddr, f->f_refcount, f->f_key.shared);
290 	FUTEXES_UNLOCK;
291 	FUTEX_UNLOCK(f);
292 
293 	LIN_SDT_PROBE0(futex, futex_put, return);
294 }
295 
296 static int
297 futex_get0(uint32_t *uaddr, struct futex **newf, uint32_t flags)
298 {
299 	struct futex *f, *tmpf;
300 	struct umtx_key key;
301 	int error;
302 
303 	LIN_SDT_PROBE3(futex, futex_get0, entry, uaddr, newf, flags);
304 
305 	*newf = tmpf = NULL;
306 
307 	error = umtx_key_get(uaddr, TYPE_FUTEX, (flags & FUTEX_SHARED) ?
308 	    AUTO_SHARE : THREAD_SHARE, &key);
309 	if (error) {
310 		LIN_SDT_PROBE1(futex, futex_get0, umtx_key_get_error, error);
311 		LIN_SDT_PROBE1(futex, futex_get0, return, error);
312 		return (error);
313 	}
314 retry:
315 	FUTEXES_LOCK;
316 	LIST_FOREACH(f, &futex_list, f_list) {
317 		if (umtx_key_match(&f->f_key, &key)) {
318 			if (tmpf != NULL) {
319 				FUTEX_UNLOCK(tmpf);
320 				FUTEX_DESTROY(tmpf);
321 				free(tmpf, M_FUTEX);
322 			}
323 			if (flags & FUTEX_DONTEXISTS) {
324 				FUTEXES_UNLOCK;
325 				umtx_key_release(&key);
326 
327 				LIN_SDT_PROBE1(futex, futex_get0, return,
328 				    EINVAL);
329 				return (EINVAL);
330 			}
331 
332 			/*
333 			 * Increment refcount of the found futex to
334 			 * prevent it from deallocation before FUTEX_LOCK()
335 			 */
336 			++f->f_refcount;
337 			FUTEXES_UNLOCK;
338 			umtx_key_release(&key);
339 
340 			FUTEX_LOCK(f);
341 			*newf = f;
342 			LIN_SDT_PROBE3(futex, futex_get0, shared, uaddr,
343 			    f->f_refcount, f->f_key.shared);
344 			LINUX_CTR3(sys_futex, "futex_get uaddr %p ref %d shared %d",
345 			    uaddr, f->f_refcount, f->f_key.shared);
346 
347 			LIN_SDT_PROBE1(futex, futex_get0, return, 0);
348 			return (0);
349 		}
350 	}
351 
352 	if (flags & FUTEX_DONTCREATE) {
353 		FUTEXES_UNLOCK;
354 		umtx_key_release(&key);
355 		LIN_SDT_PROBE1(futex, futex_get0, null, uaddr);
356 		LINUX_CTR1(sys_futex, "futex_get uaddr %p null", uaddr);
357 
358 		LIN_SDT_PROBE1(futex, futex_get0, return, 0);
359 		return (0);
360 	}
361 
362 	if (tmpf == NULL) {
363 		FUTEXES_UNLOCK;
364 		tmpf = malloc(sizeof(*tmpf), M_FUTEX, M_WAITOK | M_ZERO);
365 		tmpf->f_uaddr = uaddr;
366 		tmpf->f_key = key;
367 		tmpf->f_refcount = 1;
368 		tmpf->f_bitset = FUTEX_BITSET_MATCH_ANY;
369 		FUTEX_INIT(tmpf);
370 		TAILQ_INIT(&tmpf->f_waiting_proc);
371 
372 		/*
373 		 * Lock the new futex before an insert into the futex_list
374 		 * to prevent futex usage by other.
375 		 */
376 		FUTEX_LOCK(tmpf);
377 		goto retry;
378 	}
379 
380 	LIST_INSERT_HEAD(&futex_list, tmpf, f_list);
381 	FUTEXES_UNLOCK;
382 
383 	LIN_SDT_PROBE3(futex, futex_get0, new, uaddr, tmpf->f_refcount,
384 	    tmpf->f_key.shared);
385 	LINUX_CTR3(sys_futex, "futex_get uaddr %p ref %d shared %d new",
386 	    uaddr, tmpf->f_refcount, tmpf->f_key.shared);
387 	*newf = tmpf;
388 
389 	LIN_SDT_PROBE1(futex, futex_get0, return, 0);
390 	return (0);
391 }
392 
393 static int
394 futex_get(uint32_t *uaddr, struct waiting_proc **wp, struct futex **f,
395     uint32_t flags)
396 {
397 	int error;
398 
399 	LIN_SDT_PROBE3(futex, futex_get, entry, uaddr, wp, f);
400 
401 	if (flags & FUTEX_CREATE_WP) {
402 		*wp = malloc(sizeof(struct waiting_proc), M_FUTEX_WP, M_WAITOK);
403 		(*wp)->wp_flags = 0;
404 	}
405 	error = futex_get0(uaddr, f, flags);
406 	if (error) {
407 		LIN_SDT_PROBE0(futex, futex_get, error);
408 
409 		if (flags & FUTEX_CREATE_WP)
410 			free(*wp, M_FUTEX_WP);
411 
412 		LIN_SDT_PROBE1(futex, futex_get, return, error);
413 		return (error);
414 	}
415 	if (flags & FUTEX_CREATE_WP) {
416 		TAILQ_INSERT_HEAD(&(*f)->f_waiting_proc, *wp, wp_list);
417 		(*wp)->wp_futex = *f;
418 	}
419 
420 	LIN_SDT_PROBE1(futex, futex_get, return, error);
421 	return (error);
422 }
423 
424 static int
425 futex_sleep(struct futex *f, struct waiting_proc *wp, int timeout)
426 {
427 	int error;
428 
429 	FUTEX_ASSERT_LOCKED(f);
430 	LIN_SDT_PROBE3(futex, futex_sleep, entry, f, wp, timeout);
431 	LINUX_CTR4(sys_futex, "futex_sleep enter uaddr %p wp %p timo %d ref %d",
432 	    f->f_uaddr, wp, timeout, f->f_refcount);
433 	error = sx_sleep(wp, &f->f_lck, PCATCH, "futex", timeout);
434 	if (wp->wp_flags & FUTEX_WP_REQUEUED) {
435 		KASSERT(f != wp->wp_futex, ("futex != wp_futex"));
436 
437 		if (error) {
438 			LIN_SDT_PROBE5(futex, futex_sleep, requeue_error, error,
439 			    f->f_uaddr, wp, wp->wp_futex->f_uaddr,
440 			    wp->wp_futex->f_refcount);
441 		}
442 
443 		LINUX_CTR5(sys_futex, "futex_sleep out error %d uaddr %p wp"
444 		    " %p requeued uaddr %p ref %d",
445 		    error, f->f_uaddr, wp, wp->wp_futex->f_uaddr,
446 		    wp->wp_futex->f_refcount);
447 		futex_put(f, NULL);
448 		f = wp->wp_futex;
449 		FUTEX_LOCK(f);
450 	} else {
451 		if (error) {
452 			LIN_SDT_PROBE3(futex, futex_sleep, sleep_error, error,
453 			    f->f_uaddr, wp);
454 		}
455 		LINUX_CTR3(sys_futex, "futex_sleep out error %d uaddr %p wp %p",
456 		    error, f->f_uaddr, wp);
457 	}
458 
459 	futex_put(f, wp);
460 
461 	LIN_SDT_PROBE1(futex, futex_sleep, return, error);
462 	return (error);
463 }
464 
465 static int
466 futex_wake(struct futex *f, int n, uint32_t bitset)
467 {
468 	struct waiting_proc *wp, *wpt;
469 	int count = 0;
470 
471 	LIN_SDT_PROBE3(futex, futex_wake, entry, f, n, bitset);
472 
473 	if (bitset == 0) {
474 		LIN_SDT_PROBE1(futex, futex_wake, return, EINVAL);
475 		return (EINVAL);
476 	}
477 
478 	FUTEX_ASSERT_LOCKED(f);
479 	TAILQ_FOREACH_SAFE(wp, &f->f_waiting_proc, wp_list, wpt) {
480 		LIN_SDT_PROBE3(futex, futex_wake, iterate, f->f_uaddr, wp,
481 		    f->f_refcount);
482 		LINUX_CTR3(sys_futex, "futex_wake uaddr %p wp %p ref %d",
483 		    f->f_uaddr, wp, f->f_refcount);
484 		/*
485 		 * Unless we find a matching bit in
486 		 * the bitset, continue searching.
487 		 */
488 		if (!(wp->wp_futex->f_bitset & bitset))
489 			continue;
490 
491 		wp->wp_flags |= FUTEX_WP_REMOVED;
492 		TAILQ_REMOVE(&f->f_waiting_proc, wp, wp_list);
493 		LIN_SDT_PROBE1(futex, futex_wake, wakeup, wp);
494 		wakeup_one(wp);
495 		if (++count == n)
496 			break;
497 	}
498 
499 	LIN_SDT_PROBE1(futex, futex_wake, return, count);
500 	return (count);
501 }
502 
503 static int
504 futex_requeue(struct futex *f, int n, struct futex *f2, int n2)
505 {
506 	struct waiting_proc *wp, *wpt;
507 	int count = 0;
508 
509 	LIN_SDT_PROBE4(futex, futex_requeue, entry, f, n, f2, n2);
510 
511 	FUTEX_ASSERT_LOCKED(f);
512 	FUTEX_ASSERT_LOCKED(f2);
513 
514 	TAILQ_FOREACH_SAFE(wp, &f->f_waiting_proc, wp_list, wpt) {
515 		if (++count <= n) {
516 			LINUX_CTR2(sys_futex, "futex_req_wake uaddr %p wp %p",
517 			    f->f_uaddr, wp);
518 			wp->wp_flags |= FUTEX_WP_REMOVED;
519 			TAILQ_REMOVE(&f->f_waiting_proc, wp, wp_list);
520 			LIN_SDT_PROBE1(futex, futex_requeue, wakeup, wp);
521 			wakeup_one(wp);
522 		} else {
523 			LIN_SDT_PROBE3(futex, futex_requeue, requeue,
524 			    f->f_uaddr, wp, f2->f_uaddr);
525 			LINUX_CTR3(sys_futex, "futex_requeue uaddr %p wp %p to %p",
526 			    f->f_uaddr, wp, f2->f_uaddr);
527 			wp->wp_flags |= FUTEX_WP_REQUEUED;
528 			/* Move wp to wp_list of f2 futex */
529 			TAILQ_REMOVE(&f->f_waiting_proc, wp, wp_list);
530 			TAILQ_INSERT_HEAD(&f2->f_waiting_proc, wp, wp_list);
531 
532 			/*
533 			 * Thread which sleeps on wp after waking should
534 			 * acquire f2 lock, so increment refcount of f2 to
535 			 * prevent it from premature deallocation.
536 			 */
537 			wp->wp_futex = f2;
538 			FUTEXES_LOCK;
539 			++f2->f_refcount;
540 			FUTEXES_UNLOCK;
541 			if (count - n >= n2)
542 				break;
543 		}
544 	}
545 
546 	LIN_SDT_PROBE1(futex, futex_requeue, return, count);
547 	return (count);
548 }
549 
550 static int
551 futex_wait(struct futex *f, struct waiting_proc *wp, int timeout_hz,
552     uint32_t bitset)
553 {
554 	int error;
555 
556 	LIN_SDT_PROBE4(futex, futex_wait, entry, f, wp, timeout_hz, bitset);
557 
558 	if (bitset == 0) {
559 		LIN_SDT_PROBE1(futex, futex_wait, return, EINVAL);
560 		return (EINVAL);
561 	}
562 
563 	f->f_bitset = bitset;
564 	error = futex_sleep(f, wp, timeout_hz);
565 	if (error)
566 		LIN_SDT_PROBE1(futex, futex_wait, sleep_error, error);
567 	if (error == EWOULDBLOCK)
568 		error = ETIMEDOUT;
569 
570 	LIN_SDT_PROBE1(futex, futex_wait, return, error);
571 	return (error);
572 }
573 
574 static int
575 futex_atomic_op(struct thread *td, int encoded_op, uint32_t *uaddr)
576 {
577 	int op = (encoded_op >> 28) & 7;
578 	int cmp = (encoded_op >> 24) & 15;
579 	int oparg = (encoded_op << 8) >> 20;
580 	int cmparg = (encoded_op << 20) >> 20;
581 	int oldval = 0, ret;
582 
583 	LIN_SDT_PROBE3(futex, futex_atomic_op, entry, td, encoded_op, uaddr);
584 
585 	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
586 		oparg = 1 << oparg;
587 
588 	LIN_SDT_PROBE4(futex, futex_atomic_op, decoded_op, op, cmp, oparg,
589 	    cmparg);
590 
591 	/* XXX: Linux verifies access here and returns EFAULT */
592 	LIN_SDT_PROBE0(futex, futex_atomic_op, missing_access_check);
593 
594 	switch (op) {
595 	case FUTEX_OP_SET:
596 		ret = futex_xchgl(oparg, uaddr, &oldval);
597 		break;
598 	case FUTEX_OP_ADD:
599 		ret = futex_addl(oparg, uaddr, &oldval);
600 		break;
601 	case FUTEX_OP_OR:
602 		ret = futex_orl(oparg, uaddr, &oldval);
603 		break;
604 	case FUTEX_OP_ANDN:
605 		ret = futex_andl(~oparg, uaddr, &oldval);
606 		break;
607 	case FUTEX_OP_XOR:
608 		ret = futex_xorl(oparg, uaddr, &oldval);
609 		break;
610 	default:
611 		LIN_SDT_PROBE1(futex, futex_atomic_op, unimplemented_op, op);
612 		ret = -ENOSYS;
613 		break;
614 	}
615 
616 	if (ret) {
617 		LIN_SDT_PROBE1(futex, futex_atomic_op, return, ret);
618 		return (ret);
619 	}
620 
621 	switch (cmp) {
622 	case FUTEX_OP_CMP_EQ:
623 		ret = (oldval == cmparg);
624 		break;
625 	case FUTEX_OP_CMP_NE:
626 		ret = (oldval != cmparg);
627 		break;
628 	case FUTEX_OP_CMP_LT:
629 		ret = (oldval < cmparg);
630 		break;
631 	case FUTEX_OP_CMP_GE:
632 		ret = (oldval >= cmparg);
633 		break;
634 	case FUTEX_OP_CMP_LE:
635 		ret = (oldval <= cmparg);
636 		break;
637 	case FUTEX_OP_CMP_GT:
638 		ret = (oldval > cmparg);
639 		break;
640 	default:
641 		LIN_SDT_PROBE1(futex, futex_atomic_op, unimplemented_cmp, cmp);
642 		ret = -ENOSYS;
643 	}
644 
645 	LIN_SDT_PROBE1(futex, futex_atomic_op, return, ret);
646 	return (ret);
647 }
648 
649 int
650 linux_sys_futex(struct thread *td, struct linux_sys_futex_args *args)
651 {
652 	int clockrt, nrwake, op_ret, ret;
653 	struct linux_pemuldata *pem;
654 	struct waiting_proc *wp;
655 	struct futex *f, *f2;
656 	struct l_timespec timeout;
657 	struct timeval utv, ctv;
658 	int timeout_hz;
659 	int error;
660 	uint32_t flags, val;
661 
662 	LIN_SDT_PROBE2(futex, linux_sys_futex, entry, td, args);
663 
664 	if (args->op & LINUX_FUTEX_PRIVATE_FLAG) {
665 		flags = 0;
666 		args->op &= ~LINUX_FUTEX_PRIVATE_FLAG;
667 	} else
668 		flags = FUTEX_SHARED;
669 
670 	/*
671 	 * Currently support for switching between CLOCK_MONOTONIC and
672 	 * CLOCK_REALTIME is not present. However Linux forbids the use of
673 	 * FUTEX_CLOCK_REALTIME with any op except FUTEX_WAIT_BITSET and
674 	 * FUTEX_WAIT_REQUEUE_PI.
675 	 */
676 	clockrt = args->op & LINUX_FUTEX_CLOCK_REALTIME;
677 	args->op = args->op & ~LINUX_FUTEX_CLOCK_REALTIME;
678 	if (clockrt && args->op != LINUX_FUTEX_WAIT_BITSET &&
679 		args->op != LINUX_FUTEX_WAIT_REQUEUE_PI) {
680 		LIN_SDT_PROBE0(futex, linux_sys_futex,
681 		    unimplemented_clockswitch);
682 		LIN_SDT_PROBE1(futex, linux_sys_futex, return, ENOSYS);
683 		return (ENOSYS);
684 	}
685 
686 	error = 0;
687 	f = f2 = NULL;
688 
689 	switch (args->op) {
690 	case LINUX_FUTEX_WAIT:
691 		args->val3 = FUTEX_BITSET_MATCH_ANY;
692 		/* FALLTHROUGH */
693 
694 	case LINUX_FUTEX_WAIT_BITSET:
695 		LIN_SDT_PROBE3(futex, linux_sys_futex, debug_wait, args->uaddr,
696 		    args->val, args->val3);
697 		LINUX_CTR3(sys_futex, "WAIT uaddr %p val 0x%x bitset 0x%x",
698 		    args->uaddr, args->val, args->val3);
699 
700 		error = futex_get(args->uaddr, &wp, &f,
701 		    flags | FUTEX_CREATE_WP);
702 		if (error) {
703 			LIN_SDT_PROBE1(futex, linux_sys_futex, return, error);
704 			return (error);
705 		}
706 
707 		error = copyin(args->uaddr, &val, sizeof(val));
708 		if (error) {
709 			LIN_SDT_PROBE1(futex, linux_sys_futex, copyin_error,
710 			    error);
711 			LINUX_CTR1(sys_futex, "WAIT copyin failed %d",
712 			    error);
713 			futex_put(f, wp);
714 
715 			LIN_SDT_PROBE1(futex, linux_sys_futex, return, error);
716 			return (error);
717 		}
718 		if (val != args->val) {
719 			LIN_SDT_PROBE4(futex, linux_sys_futex,
720 			    debug_wait_value_neq, args->uaddr, args->val, val,
721 			    args->val3);
722 			LINUX_CTR3(sys_futex,
723 			    "WAIT uaddr %p val 0x%x != uval 0x%x",
724 			    args->uaddr, args->val, val);
725 			futex_put(f, wp);
726 
727 			LIN_SDT_PROBE1(futex, linux_sys_futex, return,
728 			    EWOULDBLOCK);
729 			return (EWOULDBLOCK);
730 		}
731 
732 		if (args->timeout != NULL) {
733 			error = copyin(args->timeout, &timeout, sizeof(timeout));
734 			if (error) {
735 				LIN_SDT_PROBE1(futex, linux_sys_futex, copyin_error,
736 				    error);
737 				LIN_SDT_PROBE1(futex, linux_sys_futex, return, error);
738 				futex_put(f, wp);
739 				return (error);
740 			}
741 			TIMESPEC_TO_TIMEVAL(&utv, &timeout);
742 			error = itimerfix(&utv);
743 			if (error) {
744 				LIN_SDT_PROBE1(futex, linux_sys_futex, itimerfix_error,
745 				    error);
746 				LIN_SDT_PROBE1(futex, linux_sys_futex, return, error);
747 				futex_put(f, wp);
748 				return (error);
749 			}
750 			if (clockrt) {
751 				microtime(&ctv);
752 				timevalsub(&utv, &ctv);
753 			} else if (args->op == LINUX_FUTEX_WAIT_BITSET) {
754 				microuptime(&ctv);
755 				timevalsub(&utv, &ctv);
756 			}
757 			if (utv.tv_sec < 0)
758 				timevalclear(&utv);
759 			timeout_hz = tvtohz(&utv);
760 		} else
761 			timeout_hz = 0;
762 
763 		error = futex_wait(f, wp, timeout_hz, args->val3);
764 		break;
765 
766 	case LINUX_FUTEX_WAKE:
767 		args->val3 = FUTEX_BITSET_MATCH_ANY;
768 		/* FALLTHROUGH */
769 
770 	case LINUX_FUTEX_WAKE_BITSET:
771 		LIN_SDT_PROBE3(futex, linux_sys_futex, debug_wake, args->uaddr,
772 		    args->val, args->val3);
773 		LINUX_CTR3(sys_futex, "WAKE uaddr %p nrwake 0x%x bitset 0x%x",
774 		    args->uaddr, args->val, args->val3);
775 
776 		error = futex_get(args->uaddr, NULL, &f,
777 		    flags | FUTEX_DONTCREATE);
778 		if (error) {
779 			LIN_SDT_PROBE1(futex, linux_sys_futex, return, error);
780 			return (error);
781 		}
782 
783 		if (f == NULL) {
784 			td->td_retval[0] = 0;
785 
786 			LIN_SDT_PROBE1(futex, linux_sys_futex, return, error);
787 			return (error);
788 		}
789 		td->td_retval[0] = futex_wake(f, args->val, args->val3);
790 		futex_put(f, NULL);
791 		break;
792 
793 	case LINUX_FUTEX_CMP_REQUEUE:
794 		LIN_SDT_PROBE5(futex, linux_sys_futex, debug_cmp_requeue,
795 		    args->uaddr, args->val, args->val3, args->uaddr2,
796 		    args->timeout);
797 		LINUX_CTR5(sys_futex, "CMP_REQUEUE uaddr %p "
798 		    "nrwake 0x%x uval 0x%x uaddr2 %p nrequeue 0x%x",
799 		    args->uaddr, args->val, args->val3, args->uaddr2,
800 		    args->timeout);
801 
802 		/*
803 		 * Linux allows this, we would not, it is an incorrect
804 		 * usage of declared ABI, so return EINVAL.
805 		 */
806 		if (args->uaddr == args->uaddr2) {
807 			LIN_SDT_PROBE0(futex, linux_sys_futex,
808 			    invalid_cmp_requeue_use);
809 			LIN_SDT_PROBE1(futex, linux_sys_futex, return, EINVAL);
810 			return (EINVAL);
811 		}
812 
813 		error = futex_get(args->uaddr, NULL, &f, flags);
814 		if (error) {
815 			LIN_SDT_PROBE1(futex, linux_sys_futex, return, error);
816 			return (error);
817 		}
818 
819 		/*
820 		 * To avoid deadlocks return EINVAL if second futex
821 		 * exists at this time.
822 		 *
823 		 * Glibc fall back to FUTEX_WAKE in case of any error
824 		 * returned by FUTEX_CMP_REQUEUE.
825 		 */
826 		error = futex_get(args->uaddr2, NULL, &f2,
827 		    flags | FUTEX_DONTEXISTS);
828 		if (error) {
829 			futex_put(f, NULL);
830 
831 			LIN_SDT_PROBE1(futex, linux_sys_futex, return, error);
832 			return (error);
833 		}
834 		error = copyin(args->uaddr, &val, sizeof(val));
835 		if (error) {
836 			LIN_SDT_PROBE1(futex, linux_sys_futex, copyin_error,
837 			    error);
838 			LINUX_CTR1(sys_futex, "CMP_REQUEUE copyin failed %d",
839 			    error);
840 			futex_put(f2, NULL);
841 			futex_put(f, NULL);
842 
843 			LIN_SDT_PROBE1(futex, linux_sys_futex, return, error);
844 			return (error);
845 		}
846 		if (val != args->val3) {
847 			LIN_SDT_PROBE2(futex, linux_sys_futex,
848 			    debug_cmp_requeue_value_neq, args->val, val);
849 			LINUX_CTR2(sys_futex, "CMP_REQUEUE val 0x%x != uval 0x%x",
850 			    args->val, val);
851 			futex_put(f2, NULL);
852 			futex_put(f, NULL);
853 
854 			LIN_SDT_PROBE1(futex, linux_sys_futex, return, EAGAIN);
855 			return (EAGAIN);
856 		}
857 
858 		nrwake = (int)(unsigned long)args->timeout;
859 		td->td_retval[0] = futex_requeue(f, args->val, f2, nrwake);
860 		futex_put(f2, NULL);
861 		futex_put(f, NULL);
862 		break;
863 
864 	case LINUX_FUTEX_WAKE_OP:
865 		LIN_SDT_PROBE5(futex, linux_sys_futex, debug_wake_op,
866 		    args->uaddr, args->op, args->val, args->uaddr2, args->val3);
867 		LINUX_CTR5(sys_futex, "WAKE_OP "
868 		    "uaddr %p nrwake 0x%x uaddr2 %p op 0x%x nrwake2 0x%x",
869 		    args->uaddr, args->val, args->uaddr2, args->val3,
870 		    args->timeout);
871 
872 		error = futex_get(args->uaddr, NULL, &f, flags);
873 		if (error) {
874 			LIN_SDT_PROBE1(futex, linux_sys_futex, return, error);
875 			return (error);
876 		}
877 
878 		if (args->uaddr != args->uaddr2)
879 			error = futex_get(args->uaddr2, NULL, &f2, flags);
880 		if (error) {
881 			futex_put(f, NULL);
882 
883 			LIN_SDT_PROBE1(futex, linux_sys_futex, return, error);
884 			return (error);
885 		}
886 
887 		/*
888 		 * This function returns positive number as results and
889 		 * negative as errors
890 		 */
891 		op_ret = futex_atomic_op(td, args->val3, args->uaddr2);
892 
893 		LINUX_CTR2(sys_futex, "WAKE_OP atomic_op uaddr %p ret 0x%x",
894 		    args->uaddr, op_ret);
895 
896 		if (op_ret < 0) {
897 			/* XXX: We don't handle the EFAULT yet. */
898 			if (op_ret != -EFAULT) {
899 				if (f2 != NULL)
900 					futex_put(f2, NULL);
901 				futex_put(f, NULL);
902 
903 				LIN_SDT_PROBE1(futex, linux_sys_futex, return,
904 				    -op_ret);
905 				return (-op_ret);
906 			} else {
907 				LIN_SDT_PROBE0(futex, linux_sys_futex,
908 				    unhandled_efault);
909 			}
910 			if (f2 != NULL)
911 				futex_put(f2, NULL);
912 			futex_put(f, NULL);
913 
914 			LIN_SDT_PROBE1(futex, linux_sys_futex, return, EFAULT);
915 			return (EFAULT);
916 		}
917 
918 		ret = futex_wake(f, args->val, args->val3);
919 
920 		if (op_ret > 0) {
921 			op_ret = 0;
922 			nrwake = (int)(unsigned long)args->timeout;
923 
924 			if (f2 != NULL)
925 				op_ret += futex_wake(f2, nrwake, args->val3);
926 			else
927 				op_ret += futex_wake(f, nrwake, args->val3);
928 			ret += op_ret;
929 
930 		}
931 		if (f2 != NULL)
932 			futex_put(f2, NULL);
933 		futex_put(f, NULL);
934 		td->td_retval[0] = ret;
935 		break;
936 
937 	case LINUX_FUTEX_LOCK_PI:
938 		/* not yet implemented */
939 		linux_msg(td,
940 			  "linux_sys_futex: "
941 			  "op LINUX_FUTEX_LOCK_PI not implemented\n");
942 		LIN_SDT_PROBE0(futex, linux_sys_futex, unimplemented_lock_pi);
943 		LIN_SDT_PROBE1(futex, linux_sys_futex, return, ENOSYS);
944 		return (ENOSYS);
945 
946 	case LINUX_FUTEX_UNLOCK_PI:
947 		/* not yet implemented */
948 		linux_msg(td,
949 			  "linux_sys_futex: "
950 			  "op LINUX_FUTEX_UNLOCK_PI not implemented\n");
951 		LIN_SDT_PROBE0(futex, linux_sys_futex, unimplemented_unlock_pi);
952 		LIN_SDT_PROBE1(futex, linux_sys_futex, return, ENOSYS);
953 		return (ENOSYS);
954 
955 	case LINUX_FUTEX_TRYLOCK_PI:
956 		/* not yet implemented */
957 		linux_msg(td,
958 			  "linux_sys_futex: "
959 			  "op LINUX_FUTEX_TRYLOCK_PI not implemented\n");
960 		LIN_SDT_PROBE0(futex, linux_sys_futex,
961 		    unimplemented_trylock_pi);
962 		LIN_SDT_PROBE1(futex, linux_sys_futex, return, ENOSYS);
963 		return (ENOSYS);
964 
965 	case LINUX_FUTEX_REQUEUE:
966 
967 		/*
968 		 * Glibc does not use this operation since version 2.3.3,
969 		 * as it is racy and replaced by FUTEX_CMP_REQUEUE operation.
970 		 * Glibc versions prior to 2.3.3 fall back to FUTEX_WAKE when
971 		 * FUTEX_REQUEUE returned EINVAL.
972 		 */
973 		pem = pem_find(td->td_proc);
974 		if ((pem->flags & LINUX_XDEPR_REQUEUEOP) == 0) {
975 			linux_msg(td,
976 				  "linux_sys_futex: "
977 				  "unsupported futex_requeue op\n");
978 			pem->flags |= LINUX_XDEPR_REQUEUEOP;
979 			LIN_SDT_PROBE0(futex, linux_sys_futex,
980 			    deprecated_requeue);
981 		}
982 
983 		LIN_SDT_PROBE1(futex, linux_sys_futex, return, EINVAL);
984 		return (EINVAL);
985 
986 	case LINUX_FUTEX_WAIT_REQUEUE_PI:
987 		/* not yet implemented */
988 		linux_msg(td,
989 			  "linux_sys_futex: "
990 			  "op FUTEX_WAIT_REQUEUE_PI not implemented\n");
991 		LIN_SDT_PROBE0(futex, linux_sys_futex,
992 		    unimplemented_wait_requeue_pi);
993 		LIN_SDT_PROBE1(futex, linux_sys_futex, return, ENOSYS);
994 		return (ENOSYS);
995 
996 	case LINUX_FUTEX_CMP_REQUEUE_PI:
997 		/* not yet implemented */
998 		linux_msg(td,
999 			    "linux_sys_futex: "
1000 			    "op LINUX_FUTEX_CMP_REQUEUE_PI not implemented\n");
1001 		LIN_SDT_PROBE0(futex, linux_sys_futex,
1002 		    unimplemented_cmp_requeue_pi);
1003 		LIN_SDT_PROBE1(futex, linux_sys_futex, return, ENOSYS);
1004 		return (ENOSYS);
1005 
1006 	default:
1007 		linux_msg(td,
1008 			  "linux_sys_futex: unknown op %d\n", args->op);
1009 		LIN_SDT_PROBE1(futex, linux_sys_futex, unknown_operation,
1010 		    args->op);
1011 		LIN_SDT_PROBE1(futex, linux_sys_futex, return, ENOSYS);
1012 		return (ENOSYS);
1013 	}
1014 
1015 	LIN_SDT_PROBE1(futex, linux_sys_futex, return, error);
1016 	return (error);
1017 }
1018 
1019 int
1020 linux_set_robust_list(struct thread *td, struct linux_set_robust_list_args *args)
1021 {
1022 	struct linux_emuldata *em;
1023 
1024 	LIN_SDT_PROBE2(futex, linux_set_robust_list, entry, td, args);
1025 
1026 	if (args->len != sizeof(struct linux_robust_list_head)) {
1027 		LIN_SDT_PROBE0(futex, linux_set_robust_list, size_error);
1028 		LIN_SDT_PROBE1(futex, linux_set_robust_list, return, EINVAL);
1029 		return (EINVAL);
1030 	}
1031 
1032 	em = em_find(td);
1033 	em->robust_futexes = args->head;
1034 
1035 	LIN_SDT_PROBE1(futex, linux_set_robust_list, return, 0);
1036 	return (0);
1037 }
1038 
1039 int
1040 linux_get_robust_list(struct thread *td, struct linux_get_robust_list_args *args)
1041 {
1042 	struct linux_emuldata *em;
1043 	struct linux_robust_list_head *head;
1044 	l_size_t len = sizeof(struct linux_robust_list_head);
1045 	struct thread *td2;
1046 	int error = 0;
1047 
1048 	LIN_SDT_PROBE2(futex, linux_get_robust_list, entry, td, args);
1049 
1050 	if (!args->pid) {
1051 		em = em_find(td);
1052 		KASSERT(em != NULL, ("get_robust_list: emuldata notfound.\n"));
1053 		head = em->robust_futexes;
1054 	} else {
1055 		td2 = tdfind(args->pid, -1);
1056 		if (td2 == NULL) {
1057 			LIN_SDT_PROBE1(futex, linux_get_robust_list, return,
1058 			    ESRCH);
1059 			return (ESRCH);
1060 		}
1061 
1062 		em = em_find(td2);
1063 		KASSERT(em != NULL, ("get_robust_list: emuldata notfound.\n"));
1064 		/* XXX: ptrace? */
1065 		if (priv_check(td, PRIV_CRED_SETUID) ||
1066 		    priv_check(td, PRIV_CRED_SETEUID) ||
1067 		    p_candebug(td, td2->td_proc)) {
1068 			PROC_UNLOCK(td2->td_proc);
1069 
1070 			LIN_SDT_PROBE1(futex, linux_get_robust_list, return,
1071 			    EPERM);
1072 			return (EPERM);
1073 		}
1074 		head = em->robust_futexes;
1075 
1076 		PROC_UNLOCK(td2->td_proc);
1077 	}
1078 
1079 	error = copyout(&len, args->len, sizeof(l_size_t));
1080 	if (error) {
1081 		LIN_SDT_PROBE1(futex, linux_get_robust_list, copyout_error,
1082 		    error);
1083 		LIN_SDT_PROBE1(futex, linux_get_robust_list, return, EFAULT);
1084 		return (EFAULT);
1085 	}
1086 
1087 	error = copyout(head, args->head, sizeof(struct linux_robust_list_head));
1088 	if (error) {
1089 		LIN_SDT_PROBE1(futex, linux_get_robust_list, copyout_error,
1090 		    error);
1091 	}
1092 
1093 	LIN_SDT_PROBE1(futex, linux_get_robust_list, return, error);
1094 	return (error);
1095 }
1096 
1097 static int
1098 handle_futex_death(struct linux_emuldata *em, uint32_t *uaddr,
1099     unsigned int pi)
1100 {
1101 	uint32_t uval, nval, mval;
1102 	struct futex *f;
1103 	int error;
1104 
1105 	LIN_SDT_PROBE3(futex, handle_futex_death, entry, em, uaddr, pi);
1106 
1107 retry:
1108 	error = copyin(uaddr, &uval, 4);
1109 	if (error) {
1110 		LIN_SDT_PROBE1(futex, handle_futex_death, copyin_error, error);
1111 		LIN_SDT_PROBE1(futex, handle_futex_death, return, EFAULT);
1112 		return (EFAULT);
1113 	}
1114 	if ((uval & FUTEX_TID_MASK) == em->em_tid) {
1115 		mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
1116 		nval = casuword32(uaddr, uval, mval);
1117 
1118 		if (nval == -1) {
1119 			LIN_SDT_PROBE1(futex, handle_futex_death, return,
1120 			    EFAULT);
1121 			return (EFAULT);
1122 		}
1123 
1124 		if (nval != uval)
1125 			goto retry;
1126 
1127 		if (!pi && (uval & FUTEX_WAITERS)) {
1128 			error = futex_get(uaddr, NULL, &f,
1129 			    FUTEX_DONTCREATE | FUTEX_SHARED);
1130 			if (error) {
1131 				LIN_SDT_PROBE1(futex, handle_futex_death,
1132 				    return, error);
1133 				return (error);
1134 			}
1135 			if (f != NULL) {
1136 				futex_wake(f, 1, FUTEX_BITSET_MATCH_ANY);
1137 				futex_put(f, NULL);
1138 			}
1139 		}
1140 	}
1141 
1142 	LIN_SDT_PROBE1(futex, handle_futex_death, return, 0);
1143 	return (0);
1144 }
1145 
1146 static int
1147 fetch_robust_entry(struct linux_robust_list **entry,
1148     struct linux_robust_list **head, unsigned int *pi)
1149 {
1150 	l_ulong uentry;
1151 	int error;
1152 
1153 	LIN_SDT_PROBE3(futex, fetch_robust_entry, entry, entry, head, pi);
1154 
1155 	error = copyin((const void *)head, &uentry, sizeof(l_ulong));
1156 	if (error) {
1157 		LIN_SDT_PROBE1(futex, fetch_robust_entry, copyin_error, error);
1158 		LIN_SDT_PROBE1(futex, fetch_robust_entry, return, EFAULT);
1159 		return (EFAULT);
1160 	}
1161 
1162 	*entry = (void *)(uentry & ~1UL);
1163 	*pi = uentry & 1;
1164 
1165 	LIN_SDT_PROBE1(futex, fetch_robust_entry, return, 0);
1166 	return (0);
1167 }
1168 
1169 /* This walks the list of robust futexes releasing them. */
1170 void
1171 release_futexes(struct thread *td, struct linux_emuldata *em)
1172 {
1173 	struct linux_robust_list_head *head = NULL;
1174 	struct linux_robust_list *entry, *next_entry, *pending;
1175 	unsigned int limit = 2048, pi, next_pi, pip;
1176 	l_long futex_offset;
1177 	int rc, error;
1178 
1179 	LIN_SDT_PROBE2(futex, release_futexes, entry, td, em);
1180 
1181 	head = em->robust_futexes;
1182 
1183 	if (head == NULL) {
1184 		LIN_SDT_PROBE0(futex, release_futexes, return);
1185 		return;
1186 	}
1187 
1188 	if (fetch_robust_entry(&entry, PTRIN(&head->list.next), &pi)) {
1189 		LIN_SDT_PROBE0(futex, release_futexes, return);
1190 		return;
1191 	}
1192 
1193 	error = copyin(&head->futex_offset, &futex_offset,
1194 	    sizeof(futex_offset));
1195 	if (error) {
1196 		LIN_SDT_PROBE1(futex, release_futexes, copyin_error, error);
1197 		LIN_SDT_PROBE0(futex, release_futexes, return);
1198 		return;
1199 	}
1200 
1201 	if (fetch_robust_entry(&pending, PTRIN(&head->pending_list), &pip)) {
1202 		LIN_SDT_PROBE0(futex, release_futexes, return);
1203 		return;
1204 	}
1205 
1206 	while (entry != &head->list) {
1207 		rc = fetch_robust_entry(&next_entry, PTRIN(&entry->next), &next_pi);
1208 
1209 		if (entry != pending)
1210 			if (handle_futex_death(em,
1211 			    (uint32_t *)((caddr_t)entry + futex_offset), pi)) {
1212 				LIN_SDT_PROBE0(futex, release_futexes, return);
1213 				return;
1214 			}
1215 		if (rc) {
1216 			LIN_SDT_PROBE0(futex, release_futexes, return);
1217 			return;
1218 		}
1219 
1220 		entry = next_entry;
1221 		pi = next_pi;
1222 
1223 		if (!--limit)
1224 			break;
1225 
1226 		sched_relinquish(curthread);
1227 	}
1228 
1229 	if (pending)
1230 		handle_futex_death(em, (uint32_t *)((caddr_t)pending + futex_offset), pip);
1231 
1232 	LIN_SDT_PROBE0(futex, release_futexes, return);
1233 }
1234