xref: /freebsd/sys/compat/linux/linux_futex.c (revision 545ddfbe7d4fe8adfb862903b24eac1d5896c1ef)
1 /*	$NetBSD: linux_futex.c,v 1.7 2006/07/24 19:01:49 manu Exp $ */
2 
3 /*-
4  * Copyright (c) 2005 Emmanuel Dreyfus, all rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  * 3. All advertising materials mentioning features or use of this software
15  *    must display the following acknowledgement:
16  *	This product includes software developed by Emmanuel Dreyfus
17  * 4. The name of the author may not be used to endorse or promote
18  *    products derived from this software without specific prior written
19  *    permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE THE AUTHOR AND CONTRIBUTORS ``AS IS''
22  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
23  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
24  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS
25  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31  * POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include <sys/cdefs.h>
35 __FBSDID("$FreeBSD$");
36 #if 0
37 __KERNEL_RCSID(1, "$NetBSD: linux_futex.c,v 1.7 2006/07/24 19:01:49 manu Exp $");
38 #endif
39 
40 #include "opt_compat.h"
41 
42 #include <sys/param.h>
43 #include <sys/systm.h>
44 #include <sys/imgact.h>
45 #include <sys/kernel.h>
46 #include <sys/ktr.h>
47 #include <sys/lock.h>
48 #include <sys/malloc.h>
49 #include <sys/mutex.h>
50 #include <sys/priv.h>
51 #include <sys/proc.h>
52 #include <sys/queue.h>
53 #include <sys/sched.h>
54 #include <sys/sdt.h>
55 #include <sys/sx.h>
56 #include <sys/umtx.h>
57 
58 #ifdef COMPAT_LINUX32
59 #include <machine/../linux32/linux.h>
60 #include <machine/../linux32/linux32_proto.h>
61 #else
62 #include <machine/../linux/linux.h>
63 #include <machine/../linux/linux_proto.h>
64 #endif
65 #include <compat/linux/linux_dtrace.h>
66 #include <compat/linux/linux_emul.h>
67 #include <compat/linux/linux_futex.h>
68 #include <compat/linux/linux_util.h>
69 
70 /* DTrace init */
71 LIN_SDT_PROVIDER_DECLARE(LINUX_DTRACE);
72 
73 /* Linuxulator-global DTrace probes */
74 LIN_SDT_PROBE_DECLARE(locks, emul_lock, locked);
75 LIN_SDT_PROBE_DECLARE(locks, emul_lock, unlock);
76 
77 /**
78  * Futex part for the special DTrace module "locks".
79  */
80 LIN_SDT_PROBE_DEFINE1(locks, futex_mtx, locked, "struct mtx *");
81 LIN_SDT_PROBE_DEFINE1(locks, futex_mtx, unlock, "struct mtx *");
82 
83 /**
84  * Per futex probes.
85  */
86 LIN_SDT_PROBE_DEFINE1(futex, futex, create, "struct sx *");
87 LIN_SDT_PROBE_DEFINE1(futex, futex, destroy, "struct sx *");
88 
89 /**
90  * DTrace probes in this module.
91  */
92 LIN_SDT_PROBE_DEFINE2(futex, futex_put, entry, "struct futex *",
93     "struct waiting_proc *");
94 LIN_SDT_PROBE_DEFINE3(futex, futex_put, destroy, "uint32_t *", "uint32_t",
95     "int");
96 LIN_SDT_PROBE_DEFINE3(futex, futex_put, unlock, "uint32_t *", "uint32_t",
97     "int");
98 LIN_SDT_PROBE_DEFINE0(futex, futex_put, return);
99 LIN_SDT_PROBE_DEFINE3(futex, futex_get0, entry, "uint32_t *", "struct futex **",
100     "uint32_t");
101 LIN_SDT_PROBE_DEFINE1(futex, futex_get0, umtx_key_get_error, "int");
102 LIN_SDT_PROBE_DEFINE3(futex, futex_get0, shared, "uint32_t *", "uint32_t",
103     "int");
104 LIN_SDT_PROBE_DEFINE1(futex, futex_get0, null, "uint32_t *");
105 LIN_SDT_PROBE_DEFINE3(futex, futex_get0, new, "uint32_t *", "uint32_t", "int");
106 LIN_SDT_PROBE_DEFINE1(futex, futex_get0, return, "int");
107 LIN_SDT_PROBE_DEFINE3(futex, futex_get, entry, "uint32_t *",
108     "struct waiting_proc **", "struct futex **");
109 LIN_SDT_PROBE_DEFINE0(futex, futex_get, error);
110 LIN_SDT_PROBE_DEFINE1(futex, futex_get, return, "int");
111 LIN_SDT_PROBE_DEFINE3(futex, futex_sleep, entry, "struct futex *",
112     "struct waiting_proc **", "int");
113 LIN_SDT_PROBE_DEFINE5(futex, futex_sleep, requeue_error, "int", "uint32_t *",
114     "struct waiting_proc *", "uint32_t *", "uint32_t");
115 LIN_SDT_PROBE_DEFINE3(futex, futex_sleep, sleep_error, "int", "uint32_t *",
116     "struct waiting_proc *");
117 LIN_SDT_PROBE_DEFINE1(futex, futex_sleep, return, "int");
118 LIN_SDT_PROBE_DEFINE3(futex, futex_wake, entry, "struct futex *", "int",
119     "uint32_t");
120 LIN_SDT_PROBE_DEFINE3(futex, futex_wake, iterate, "uint32_t",
121     "struct waiting_proc *", "uint32_t");
122 LIN_SDT_PROBE_DEFINE1(futex, futex_wake, wakeup, "struct waiting_proc *");
123 LIN_SDT_PROBE_DEFINE1(futex, futex_wake, return, "int");
124 LIN_SDT_PROBE_DEFINE4(futex, futex_requeue, entry, "struct futex *", "int",
125     "struct futex *", "int");
126 LIN_SDT_PROBE_DEFINE1(futex, futex_requeue, wakeup, "struct waiting_proc *");
127 LIN_SDT_PROBE_DEFINE3(futex, futex_requeue, requeue, "uint32_t *",
128     "struct waiting_proc *", "uint32_t");
129 LIN_SDT_PROBE_DEFINE1(futex, futex_requeue, return, "int");
130 LIN_SDT_PROBE_DEFINE4(futex, futex_wait, entry, "struct futex *",
131     "struct waiting_proc **", "int", "uint32_t");
132 LIN_SDT_PROBE_DEFINE1(futex, futex_wait, sleep_error, "int");
133 LIN_SDT_PROBE_DEFINE1(futex, futex_wait, return, "int");
134 LIN_SDT_PROBE_DEFINE3(futex, futex_atomic_op, entry, "struct thread *",
135     "int", "uint32_t");
136 LIN_SDT_PROBE_DEFINE4(futex, futex_atomic_op, decoded_op, "int", "int", "int",
137     "int");
138 LIN_SDT_PROBE_DEFINE0(futex, futex_atomic_op, missing_access_check);
139 LIN_SDT_PROBE_DEFINE1(futex, futex_atomic_op, unimplemented_op, "int");
140 LIN_SDT_PROBE_DEFINE1(futex, futex_atomic_op, unimplemented_cmp, "int");
141 LIN_SDT_PROBE_DEFINE1(futex, futex_atomic_op, return, "int");
142 LIN_SDT_PROBE_DEFINE2(futex, linux_sys_futex, entry, "struct thread *",
143     "struct linux_sys_futex_args *");
144 LIN_SDT_PROBE_DEFINE0(futex, linux_sys_futex, unimplemented_clockswitch);
145 LIN_SDT_PROBE_DEFINE1(futex, linux_sys_futex, itimerfix_error, "int");
146 LIN_SDT_PROBE_DEFINE1(futex, linux_sys_futex, copyin_error, "int");
147 LIN_SDT_PROBE_DEFINE0(futex, linux_sys_futex, invalid_cmp_requeue_use);
148 LIN_SDT_PROBE_DEFINE3(futex, linux_sys_futex, debug_wait, "uint32_t *",
149     "uint32_t", "uint32_t");
150 LIN_SDT_PROBE_DEFINE4(futex, linux_sys_futex, debug_wait_value_neq,
151     "uint32_t *", "uint32_t", "int", "uint32_t");
152 LIN_SDT_PROBE_DEFINE3(futex, linux_sys_futex, debug_wake, "uint32_t *",
153     "uint32_t", "uint32_t");
154 LIN_SDT_PROBE_DEFINE5(futex, linux_sys_futex, debug_cmp_requeue, "uint32_t *",
155     "uint32_t", "uint32_t", "uint32_t *", "struct l_timespec *");
156 LIN_SDT_PROBE_DEFINE2(futex, linux_sys_futex, debug_cmp_requeue_value_neq,
157     "uint32_t", "int");
158 LIN_SDT_PROBE_DEFINE5(futex, linux_sys_futex, debug_wake_op, "uint32_t *",
159     "int", "uint32_t", "uint32_t *", "uint32_t");
160 LIN_SDT_PROBE_DEFINE0(futex, linux_sys_futex, unhandled_efault);
161 LIN_SDT_PROBE_DEFINE0(futex, linux_sys_futex, unimplemented_lock_pi);
162 LIN_SDT_PROBE_DEFINE0(futex, linux_sys_futex, unimplemented_unlock_pi);
163 LIN_SDT_PROBE_DEFINE0(futex, linux_sys_futex, unimplemented_trylock_pi);
164 LIN_SDT_PROBE_DEFINE0(futex, linux_sys_futex, deprecated_requeue);
165 LIN_SDT_PROBE_DEFINE0(futex, linux_sys_futex, unimplemented_wait_requeue_pi);
166 LIN_SDT_PROBE_DEFINE0(futex, linux_sys_futex, unimplemented_cmp_requeue_pi);
167 LIN_SDT_PROBE_DEFINE1(futex, linux_sys_futex, unknown_operation, "int");
168 LIN_SDT_PROBE_DEFINE1(futex, linux_sys_futex, return, "int");
169 LIN_SDT_PROBE_DEFINE2(futex, linux_set_robust_list, entry, "struct thread *",
170     "struct linux_set_robust_list_args *");
171 LIN_SDT_PROBE_DEFINE0(futex, linux_set_robust_list, size_error);
172 LIN_SDT_PROBE_DEFINE1(futex, linux_set_robust_list, return, "int");
173 LIN_SDT_PROBE_DEFINE2(futex, linux_get_robust_list, entry, "struct thread *",
174     "struct linux_get_robust_list_args *");
175 LIN_SDT_PROBE_DEFINE1(futex, linux_get_robust_list, copyout_error, "int");
176 LIN_SDT_PROBE_DEFINE1(futex, linux_get_robust_list, return, "int");
177 LIN_SDT_PROBE_DEFINE3(futex, handle_futex_death, entry, "struct proc *",
178     "uint32_t *", "unsigned int");
179 LIN_SDT_PROBE_DEFINE1(futex, handle_futex_death, copyin_error, "int");
180 LIN_SDT_PROBE_DEFINE1(futex, handle_futex_death, return, "int");
181 LIN_SDT_PROBE_DEFINE3(futex, fetch_robust_entry, entry,
182     "struct linux_robust_list **", "struct linux_robust_list **",
183     "unsigned int *");
184 LIN_SDT_PROBE_DEFINE1(futex, fetch_robust_entry, copyin_error, "int");
185 LIN_SDT_PROBE_DEFINE1(futex, fetch_robust_entry, return, "int");
186 LIN_SDT_PROBE_DEFINE1(futex, release_futexes, entry, "struct proc *");
187 LIN_SDT_PROBE_DEFINE1(futex, release_futexes, copyin_error, "int");
188 LIN_SDT_PROBE_DEFINE0(futex, release_futexes, return);
189 
/*
 * malloc(9) types used by this file: M_FUTEX tags struct futex
 * allocations, M_FUTEX_WP tags struct waiting_proc allocations.
 */
static MALLOC_DEFINE(M_FUTEX, "futex", "Linux futexes");
static MALLOC_DEFINE(M_FUTEX_WP, "futex wp", "Linux futexes wp");
192 
struct futex;

/*
 * One record per thread sleeping on a futex.  The record's address is
 * also the sleep/wakeup channel used by futex_sleep() and the wakers.
 */
struct waiting_proc {
	uint32_t	wp_flags;	/* FUTEX_WP_* flags */
	struct futex	*wp_futex;	/* futex whose wait list holds us;
					 * updated when requeued */
	TAILQ_ENTRY(waiting_proc) wp_list; /* linkage on f_waiting_proc */
};
200 
/*
 * Kernel-side state for one user futex word.  Instances live on the
 * global futex_list and are matched by f_key.
 */
struct futex {
	struct sx	f_lck;		/* per-futex lock, see FUTEX_LOCK() */
	uint32_t	*f_uaddr;	/* user-supplied value, for debug */
	struct umtx_key	f_key;		/* identity, compared with
					 * umtx_key_match() */
	uint32_t	f_refcount;	/* modified with futex_mtx held */
	uint32_t	f_bitset;	/* set by futex_wait(), tested by
					 * futex_wake() */
	LIST_ENTRY(futex) f_list;	/* linkage on futex_list */
	TAILQ_HEAD(lf_waiting_proc, waiting_proc) f_waiting_proc; /* waiters */
};

/* All live futexes; the list is protected by futex_mtx. */
struct futex_list futex_list;
212 
/*
 * Per-futex locking.  The sx lock is always taken exclusively.
 * FUTEX_INIT()/FUTEX_DESTROY() also fire the per-futex create/destroy
 * DTrace probes with the address of the lock.
 * NOTE(review): SX_DUPOK is presumably required because two futex locks
 * of the same name can be held at once (futex_requeue() asserts both) --
 * confirm against sx(9).
 */
#define FUTEX_LOCK(f)		sx_xlock(&(f)->f_lck)
#define FUTEX_UNLOCK(f)		sx_xunlock(&(f)->f_lck)
#define FUTEX_INIT(f)		do { \
				    sx_init_flags(&(f)->f_lck, "ftlk", \
					SX_DUPOK); \
				    LIN_SDT_PROBE1(futex, futex, create, \
					&(f)->f_lck); \
				} while (0)
#define FUTEX_DESTROY(f)	do { \
				    LIN_SDT_PROBE1(futex, futex, destroy, \
					&(f)->f_lck); \
				    sx_destroy(&(f)->f_lck); \
				} while (0)
#define FUTEX_ASSERT_LOCKED(f)	sx_assert(&(f)->f_lck, SA_XLOCKED)

/*
 * Global futex list locking.  The "locked" probe fires after the mutex
 * is acquired and the "unlock" probe fires just before it is released,
 * so both run with the mutex held.
 */
struct mtx futex_mtx;			/* protects the futex list */
#define FUTEXES_LOCK		do { \
				    mtx_lock(&futex_mtx); \
				    LIN_SDT_PROBE1(locks, futex_mtx, \
					locked, &futex_mtx); \
				} while (0)
#define FUTEXES_UNLOCK		do { \
				    LIN_SDT_PROBE1(locks, futex_mtx, \
					unlock, &futex_mtx); \
				    mtx_unlock(&futex_mtx); \
				} while (0)
239 
/* flags for futex_get() and futex_get0() */
#define FUTEX_CREATE_WP		0x1	/* allocate and queue a waiting_proc */
#define FUTEX_DONTCREATE	0x2	/* don't create the futex if it does
					 * not exist; return a NULL futex */
#define FUTEX_DONTEXISTS	0x4	/* return EINVAL if the futex exists */
#define	FUTEX_SHARED		0x8	/* shared futex: key is obtained with
					 * AUTO_SHARE, not THREAD_SHARE */

/* wp_flags */
#define FUTEX_WP_REQUEUED	0x1	/* wp was requeued: moved from the
					 * wp_list of the futex the thread went
					 * to sleep on to the wp_list of
					 * another futex.
					 */
#define FUTEX_WP_REMOVED	0x2	/* wp was woken up and already removed
					 * from the futex wp_list, to prevent a
					 * double wakeup.
					 */
254 
/*
 * support.s
 *
 * Primitives used by futex_atomic_op() to apply one operation with
 * operand oparg to the user word at uaddr.  futex_atomic_op() treats a
 * non-zero return as failure and compares *oldval afterwards.
 * NOTE(review): *oldval is presumably the value of the word before the
 * operation, and failures presumably negative errno values (the caller
 * checks for -EFAULT) -- confirm against the assembly implementation.
 */
int futex_xchgl(int oparg, uint32_t *uaddr, int *oldval);
int futex_addl(int oparg, uint32_t *uaddr, int *oldval);
int futex_orl(int oparg, uint32_t *uaddr, int *oldval);
int futex_andl(int oparg, uint32_t *uaddr, int *oldval);
int futex_xorl(int oparg, uint32_t *uaddr, int *oldval);
261 
262 static void
263 futex_put(struct futex *f, struct waiting_proc *wp)
264 {
265 	LIN_SDT_PROBE2(futex, futex_put, entry, f, wp);
266 
267 	FUTEX_ASSERT_LOCKED(f);
268 	if (wp != NULL) {
269 		if ((wp->wp_flags & FUTEX_WP_REMOVED) == 0)
270 			TAILQ_REMOVE(&f->f_waiting_proc, wp, wp_list);
271 		free(wp, M_FUTEX_WP);
272 	}
273 
274 	FUTEXES_LOCK;
275 	if (--f->f_refcount == 0) {
276 		LIST_REMOVE(f, f_list);
277 		FUTEXES_UNLOCK;
278 		FUTEX_UNLOCK(f);
279 
280 		LIN_SDT_PROBE3(futex, futex_put, destroy, f->f_uaddr,
281 		    f->f_refcount, f->f_key.shared);
282 		LINUX_CTR3(sys_futex, "futex_put destroy uaddr %p ref %d "
283 		    "shared %d", f->f_uaddr, f->f_refcount, f->f_key.shared);
284 		umtx_key_release(&f->f_key);
285 		FUTEX_DESTROY(f);
286 		free(f, M_FUTEX);
287 
288 		LIN_SDT_PROBE0(futex, futex_put, return);
289 		return;
290 	}
291 
292 	LIN_SDT_PROBE3(futex, futex_put, unlock, f->f_uaddr, f->f_refcount,
293 	    f->f_key.shared);
294 	LINUX_CTR3(sys_futex, "futex_put uaddr %p ref %d shared %d",
295 	    f->f_uaddr, f->f_refcount, f->f_key.shared);
296 	FUTEXES_UNLOCK;
297 	FUTEX_UNLOCK(f);
298 
299 	LIN_SDT_PROBE0(futex, futex_put, return);
300 }
301 
302 static int
303 futex_get0(uint32_t *uaddr, struct futex **newf, uint32_t flags)
304 {
305 	struct futex *f, *tmpf;
306 	struct umtx_key key;
307 	int error;
308 
309 	LIN_SDT_PROBE3(futex, futex_get0, entry, uaddr, newf, flags);
310 
311 	*newf = tmpf = NULL;
312 
313 	error = umtx_key_get(uaddr, TYPE_FUTEX, (flags & FUTEX_SHARED) ?
314 	    AUTO_SHARE : THREAD_SHARE, &key);
315 	if (error) {
316 		LIN_SDT_PROBE1(futex, futex_get0, umtx_key_get_error, error);
317 		LIN_SDT_PROBE1(futex, futex_get0, return, error);
318 		return (error);
319 	}
320 retry:
321 	FUTEXES_LOCK;
322 	LIST_FOREACH(f, &futex_list, f_list) {
323 		if (umtx_key_match(&f->f_key, &key)) {
324 			if (tmpf != NULL) {
325 				FUTEX_UNLOCK(tmpf);
326 				FUTEX_DESTROY(tmpf);
327 				free(tmpf, M_FUTEX);
328 			}
329 			if (flags & FUTEX_DONTEXISTS) {
330 				FUTEXES_UNLOCK;
331 				umtx_key_release(&key);
332 
333 				LIN_SDT_PROBE1(futex, futex_get0, return,
334 				    EINVAL);
335 				return (EINVAL);
336 			}
337 
338 			/*
339 			 * Increment refcount of the found futex to
340 			 * prevent it from deallocation before FUTEX_LOCK()
341 			 */
342 			++f->f_refcount;
343 			FUTEXES_UNLOCK;
344 			umtx_key_release(&key);
345 
346 			FUTEX_LOCK(f);
347 			*newf = f;
348 			LIN_SDT_PROBE3(futex, futex_get0, shared, uaddr,
349 			    f->f_refcount, f->f_key.shared);
350 			LINUX_CTR3(sys_futex, "futex_get uaddr %p ref %d shared %d",
351 			    uaddr, f->f_refcount, f->f_key.shared);
352 
353 			LIN_SDT_PROBE1(futex, futex_get0, return, 0);
354 			return (0);
355 		}
356 	}
357 
358 	if (flags & FUTEX_DONTCREATE) {
359 		FUTEXES_UNLOCK;
360 		umtx_key_release(&key);
361 		LIN_SDT_PROBE1(futex, futex_get0, null, uaddr);
362 		LINUX_CTR1(sys_futex, "futex_get uaddr %p null", uaddr);
363 
364 		LIN_SDT_PROBE1(futex, futex_get0, return, 0);
365 		return (0);
366 	}
367 
368 	if (tmpf == NULL) {
369 		FUTEXES_UNLOCK;
370 		tmpf = malloc(sizeof(*tmpf), M_FUTEX, M_WAITOK | M_ZERO);
371 		tmpf->f_uaddr = uaddr;
372 		tmpf->f_key = key;
373 		tmpf->f_refcount = 1;
374 		tmpf->f_bitset = FUTEX_BITSET_MATCH_ANY;
375 		FUTEX_INIT(tmpf);
376 		TAILQ_INIT(&tmpf->f_waiting_proc);
377 
378 		/*
379 		 * Lock the new futex before an insert into the futex_list
380 		 * to prevent futex usage by other.
381 		 */
382 		FUTEX_LOCK(tmpf);
383 		goto retry;
384 	}
385 
386 	LIST_INSERT_HEAD(&futex_list, tmpf, f_list);
387 	FUTEXES_UNLOCK;
388 
389 	LIN_SDT_PROBE3(futex, futex_get0, new, uaddr, tmpf->f_refcount,
390 	    tmpf->f_key.shared);
391 	LINUX_CTR3(sys_futex, "futex_get uaddr %p ref %d shared %d new",
392 	    uaddr, tmpf->f_refcount, tmpf->f_key.shared);
393 	*newf = tmpf;
394 
395 	LIN_SDT_PROBE1(futex, futex_get0, return, 0);
396 	return (0);
397 }
398 
399 static int
400 futex_get(uint32_t *uaddr, struct waiting_proc **wp, struct futex **f,
401     uint32_t flags)
402 {
403 	int error;
404 
405 	LIN_SDT_PROBE3(futex, futex_get, entry, uaddr, wp, f);
406 
407 	if (flags & FUTEX_CREATE_WP) {
408 		*wp = malloc(sizeof(struct waiting_proc), M_FUTEX_WP, M_WAITOK);
409 		(*wp)->wp_flags = 0;
410 	}
411 	error = futex_get0(uaddr, f, flags);
412 	if (error) {
413 		LIN_SDT_PROBE0(futex, futex_get, error);
414 
415 		if (flags & FUTEX_CREATE_WP)
416 			free(*wp, M_FUTEX_WP);
417 
418 		LIN_SDT_PROBE1(futex, futex_get, return, error);
419 		return (error);
420 	}
421 	if (flags & FUTEX_CREATE_WP) {
422 		TAILQ_INSERT_HEAD(&(*f)->f_waiting_proc, *wp, wp_list);
423 		(*wp)->wp_futex = *f;
424 	}
425 
426 	LIN_SDT_PROBE1(futex, futex_get, return, error);
427 	return (error);
428 }
429 
430 static int
431 futex_sleep(struct futex *f, struct waiting_proc *wp, int timeout)
432 {
433 	int error;
434 
435 	FUTEX_ASSERT_LOCKED(f);
436 	LIN_SDT_PROBE3(futex, futex_sleep, entry, f, wp, timeout);
437 	LINUX_CTR4(sys_futex, "futex_sleep enter uaddr %p wp %p timo %d ref %d",
438 	    f->f_uaddr, wp, timeout, f->f_refcount);
439 	error = sx_sleep(wp, &f->f_lck, PCATCH, "futex", timeout);
440 	if (wp->wp_flags & FUTEX_WP_REQUEUED) {
441 		KASSERT(f != wp->wp_futex, ("futex != wp_futex"));
442 
443 		if (error) {
444 			LIN_SDT_PROBE5(futex, futex_sleep, requeue_error, error,
445 			    f->f_uaddr, wp, wp->wp_futex->f_uaddr,
446 			    wp->wp_futex->f_refcount);
447 		}
448 
449 		LINUX_CTR5(sys_futex, "futex_sleep out error %d uaddr %p wp"
450 		    " %p requeued uaddr %p ref %d",
451 		    error, f->f_uaddr, wp, wp->wp_futex->f_uaddr,
452 		    wp->wp_futex->f_refcount);
453 		futex_put(f, NULL);
454 		f = wp->wp_futex;
455 		FUTEX_LOCK(f);
456 	} else {
457 		if (error) {
458 			LIN_SDT_PROBE3(futex, futex_sleep, sleep_error, error,
459 			    f->f_uaddr, wp);
460 		}
461 		LINUX_CTR3(sys_futex, "futex_sleep out error %d uaddr %p wp %p",
462 		    error, f->f_uaddr, wp);
463 	}
464 
465 	futex_put(f, wp);
466 
467 	LIN_SDT_PROBE1(futex, futex_sleep, return, error);
468 	return (error);
469 }
470 
471 static int
472 futex_wake(struct futex *f, int n, uint32_t bitset)
473 {
474 	struct waiting_proc *wp, *wpt;
475 	int count = 0;
476 
477 	LIN_SDT_PROBE3(futex, futex_wake, entry, f, n, bitset);
478 
479 	if (bitset == 0) {
480 		LIN_SDT_PROBE1(futex, futex_wake, return, EINVAL);
481 		return (EINVAL);
482 	}
483 
484 	FUTEX_ASSERT_LOCKED(f);
485 	TAILQ_FOREACH_SAFE(wp, &f->f_waiting_proc, wp_list, wpt) {
486 		LIN_SDT_PROBE3(futex, futex_wake, iterate, f->f_uaddr, wp,
487 		    f->f_refcount);
488 		LINUX_CTR3(sys_futex, "futex_wake uaddr %p wp %p ref %d",
489 		    f->f_uaddr, wp, f->f_refcount);
490 		/*
491 		 * Unless we find a matching bit in
492 		 * the bitset, continue searching.
493 		 */
494 		if (!(wp->wp_futex->f_bitset & bitset))
495 			continue;
496 
497 		wp->wp_flags |= FUTEX_WP_REMOVED;
498 		TAILQ_REMOVE(&f->f_waiting_proc, wp, wp_list);
499 		LIN_SDT_PROBE1(futex, futex_wake, wakeup, wp);
500 		wakeup_one(wp);
501 		if (++count == n)
502 			break;
503 	}
504 
505 	LIN_SDT_PROBE1(futex, futex_wake, return, count);
506 	return (count);
507 }
508 
509 static int
510 futex_requeue(struct futex *f, int n, struct futex *f2, int n2)
511 {
512 	struct waiting_proc *wp, *wpt;
513 	int count = 0;
514 
515 	LIN_SDT_PROBE4(futex, futex_requeue, entry, f, n, f2, n2);
516 
517 	FUTEX_ASSERT_LOCKED(f);
518 	FUTEX_ASSERT_LOCKED(f2);
519 
520 	TAILQ_FOREACH_SAFE(wp, &f->f_waiting_proc, wp_list, wpt) {
521 		if (++count <= n) {
522 			LINUX_CTR2(sys_futex, "futex_req_wake uaddr %p wp %p",
523 			    f->f_uaddr, wp);
524 			wp->wp_flags |= FUTEX_WP_REMOVED;
525 			TAILQ_REMOVE(&f->f_waiting_proc, wp, wp_list);
526 			LIN_SDT_PROBE1(futex, futex_requeue, wakeup, wp);
527 			wakeup_one(wp);
528 		} else {
529 			LIN_SDT_PROBE3(futex, futex_requeue, requeue,
530 			    f->f_uaddr, wp, f2->f_uaddr);
531 			LINUX_CTR3(sys_futex, "futex_requeue uaddr %p wp %p to %p",
532 			    f->f_uaddr, wp, f2->f_uaddr);
533 			wp->wp_flags |= FUTEX_WP_REQUEUED;
534 			/* Move wp to wp_list of f2 futex */
535 			TAILQ_REMOVE(&f->f_waiting_proc, wp, wp_list);
536 			TAILQ_INSERT_HEAD(&f2->f_waiting_proc, wp, wp_list);
537 
538 			/*
539 			 * Thread which sleeps on wp after waking should
540 			 * acquire f2 lock, so increment refcount of f2 to
541 			 * prevent it from premature deallocation.
542 			 */
543 			wp->wp_futex = f2;
544 			FUTEXES_LOCK;
545 			++f2->f_refcount;
546 			FUTEXES_UNLOCK;
547 			if (count - n >= n2)
548 				break;
549 		}
550 	}
551 
552 	LIN_SDT_PROBE1(futex, futex_requeue, return, count);
553 	return (count);
554 }
555 
556 static int
557 futex_wait(struct futex *f, struct waiting_proc *wp, int timeout_hz,
558     uint32_t bitset)
559 {
560 	int error;
561 
562 	LIN_SDT_PROBE4(futex, futex_wait, entry, f, wp, timeout_hz, bitset);
563 
564 	if (bitset == 0) {
565 		LIN_SDT_PROBE1(futex, futex_wait, return, EINVAL);
566 		return (EINVAL);
567 	}
568 
569 	f->f_bitset = bitset;
570 	error = futex_sleep(f, wp, timeout_hz);
571 	if (error)
572 		LIN_SDT_PROBE1(futex, futex_wait, sleep_error, error);
573 	if (error == EWOULDBLOCK)
574 		error = ETIMEDOUT;
575 
576 	LIN_SDT_PROBE1(futex, futex_wait, return, error);
577 	return (error);
578 }
579 
580 static int
581 futex_atomic_op(struct thread *td, int encoded_op, uint32_t *uaddr)
582 {
583 	int op = (encoded_op >> 28) & 7;
584 	int cmp = (encoded_op >> 24) & 15;
585 	int oparg = (encoded_op << 8) >> 20;
586 	int cmparg = (encoded_op << 20) >> 20;
587 	int oldval = 0, ret;
588 
589 	LIN_SDT_PROBE3(futex, futex_atomic_op, entry, td, encoded_op, uaddr);
590 
591 	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
592 		oparg = 1 << oparg;
593 
594 	LIN_SDT_PROBE4(futex, futex_atomic_op, decoded_op, op, cmp, oparg,
595 	    cmparg);
596 
597 	/* XXX: Linux verifies access here and returns EFAULT */
598 	LIN_SDT_PROBE0(futex, futex_atomic_op, missing_access_check);
599 
600 	switch (op) {
601 	case FUTEX_OP_SET:
602 		ret = futex_xchgl(oparg, uaddr, &oldval);
603 		break;
604 	case FUTEX_OP_ADD:
605 		ret = futex_addl(oparg, uaddr, &oldval);
606 		break;
607 	case FUTEX_OP_OR:
608 		ret = futex_orl(oparg, uaddr, &oldval);
609 		break;
610 	case FUTEX_OP_ANDN:
611 		ret = futex_andl(~oparg, uaddr, &oldval);
612 		break;
613 	case FUTEX_OP_XOR:
614 		ret = futex_xorl(oparg, uaddr, &oldval);
615 		break;
616 	default:
617 		LIN_SDT_PROBE1(futex, futex_atomic_op, unimplemented_op, op);
618 		ret = -ENOSYS;
619 		break;
620 	}
621 
622 	if (ret) {
623 		LIN_SDT_PROBE1(futex, futex_atomic_op, return, ret);
624 		return (ret);
625 	}
626 
627 	switch (cmp) {
628 	case FUTEX_OP_CMP_EQ:
629 		ret = (oldval == cmparg);
630 		break;
631 	case FUTEX_OP_CMP_NE:
632 		ret = (oldval != cmparg);
633 		break;
634 	case FUTEX_OP_CMP_LT:
635 		ret = (oldval < cmparg);
636 		break;
637 	case FUTEX_OP_CMP_GE:
638 		ret = (oldval >= cmparg);
639 		break;
640 	case FUTEX_OP_CMP_LE:
641 		ret = (oldval <= cmparg);
642 		break;
643 	case FUTEX_OP_CMP_GT:
644 		ret = (oldval > cmparg);
645 		break;
646 	default:
647 		LIN_SDT_PROBE1(futex, futex_atomic_op, unimplemented_cmp, cmp);
648 		ret = -ENOSYS;
649 	}
650 
651 	LIN_SDT_PROBE1(futex, futex_atomic_op, return, ret);
652 	return (ret);
653 }
654 
655 int
656 linux_sys_futex(struct thread *td, struct linux_sys_futex_args *args)
657 {
658 	int clockrt, nrwake, op_ret, ret;
659 	struct linux_emuldata *em;
660 	struct waiting_proc *wp;
661 	struct futex *f, *f2;
662 	struct l_timespec timeout;
663 	struct timeval utv, ctv;
664 	int timeout_hz;
665 	int error;
666 	uint32_t flags, val;
667 
668 	LIN_SDT_PROBE2(futex, linux_sys_futex, entry, td, args);
669 
670 	if (args->op & LINUX_FUTEX_PRIVATE_FLAG) {
671 		flags = 0;
672 		args->op &= ~LINUX_FUTEX_PRIVATE_FLAG;
673 	} else
674 		flags = FUTEX_SHARED;
675 
676 	/*
677 	 * Currently support for switching between CLOCK_MONOTONIC and
678 	 * CLOCK_REALTIME is not present. However Linux forbids the use of
679 	 * FUTEX_CLOCK_REALTIME with any op except FUTEX_WAIT_BITSET and
680 	 * FUTEX_WAIT_REQUEUE_PI.
681 	 */
682 	clockrt = args->op & LINUX_FUTEX_CLOCK_REALTIME;
683 	args->op = args->op & ~LINUX_FUTEX_CLOCK_REALTIME;
684 	if (clockrt && args->op != LINUX_FUTEX_WAIT_BITSET &&
685 		args->op != LINUX_FUTEX_WAIT_REQUEUE_PI) {
686 		LIN_SDT_PROBE0(futex, linux_sys_futex,
687 		    unimplemented_clockswitch);
688 		LIN_SDT_PROBE1(futex, linux_sys_futex, return, ENOSYS);
689 		return (ENOSYS);
690 	}
691 
692 	error = 0;
693 	f = f2 = NULL;
694 
695 	switch (args->op) {
696 	case LINUX_FUTEX_WAIT:
697 		args->val3 = FUTEX_BITSET_MATCH_ANY;
698 		/* FALLTHROUGH */
699 
700 	case LINUX_FUTEX_WAIT_BITSET:
701 		LIN_SDT_PROBE3(futex, linux_sys_futex, debug_wait, args->uaddr,
702 		    args->val, args->val3);
703 		LINUX_CTR3(sys_futex, "WAIT uaddr %p val 0x%x bitset 0x%x",
704 		    args->uaddr, args->val, args->val3);
705 
706 		error = futex_get(args->uaddr, &wp, &f,
707 		    flags | FUTEX_CREATE_WP);
708 		if (error) {
709 			LIN_SDT_PROBE1(futex, linux_sys_futex, return, error);
710 			return (error);
711 		}
712 
713 		error = copyin(args->uaddr, &val, sizeof(val));
714 		if (error) {
715 			LIN_SDT_PROBE1(futex, linux_sys_futex, copyin_error,
716 			    error);
717 			LINUX_CTR1(sys_futex, "WAIT copyin failed %d",
718 			    error);
719 			futex_put(f, wp);
720 
721 			LIN_SDT_PROBE1(futex, linux_sys_futex, return, error);
722 			return (error);
723 		}
724 		if (val != args->val) {
725 			LIN_SDT_PROBE4(futex, linux_sys_futex,
726 			    debug_wait_value_neq, args->uaddr, args->val, val,
727 			    args->val3);
728 			LINUX_CTR3(sys_futex,
729 			    "WAIT uaddr %p val 0x%x != uval 0x%x",
730 			    args->uaddr, args->val, val);
731 			futex_put(f, wp);
732 
733 			LIN_SDT_PROBE1(futex, linux_sys_futex, return,
734 			    EWOULDBLOCK);
735 			return (EWOULDBLOCK);
736 		}
737 
738 		if (args->timeout != NULL) {
739 			error = copyin(args->timeout, &timeout, sizeof(timeout));
740 			if (error) {
741 				LIN_SDT_PROBE1(futex, linux_sys_futex, copyin_error,
742 				    error);
743 				LIN_SDT_PROBE1(futex, linux_sys_futex, return, error);
744 				futex_put(f, wp);
745 				return (error);
746 			}
747 			TIMESPEC_TO_TIMEVAL(&utv, &timeout);
748 			error = itimerfix(&utv);
749 			if (error) {
750 				LIN_SDT_PROBE1(futex, linux_sys_futex, itimerfix_error,
751 				    error);
752 				LIN_SDT_PROBE1(futex, linux_sys_futex, return, error);
753 				futex_put(f, wp);
754 				return (error);
755 			}
756 			if (clockrt) {
757 				microtime(&ctv);
758 				timevalsub(&utv, &ctv);
759 			} else if (args->op == LINUX_FUTEX_WAIT_BITSET) {
760 				microuptime(&ctv);
761 				timevalsub(&utv, &ctv);
762 			}
763 			if (utv.tv_sec < 0)
764 				timevalclear(&utv);
765 			timeout_hz = tvtohz(&utv);
766 		} else
767 			timeout_hz = 0;
768 
769 		error = futex_wait(f, wp, timeout_hz, args->val3);
770 		break;
771 
772 	case LINUX_FUTEX_WAKE:
773 		args->val3 = FUTEX_BITSET_MATCH_ANY;
774 		/* FALLTHROUGH */
775 
776 	case LINUX_FUTEX_WAKE_BITSET:
777 		LIN_SDT_PROBE3(futex, linux_sys_futex, debug_wake, args->uaddr,
778 		    args->val, args->val3);
779 		LINUX_CTR3(sys_futex, "WAKE uaddr %p nrwake 0x%x bitset 0x%x",
780 		    args->uaddr, args->val, args->val3);
781 
782 		error = futex_get(args->uaddr, NULL, &f,
783 		    flags | FUTEX_DONTCREATE);
784 		if (error) {
785 			LIN_SDT_PROBE1(futex, linux_sys_futex, return, error);
786 			return (error);
787 		}
788 
789 		if (f == NULL) {
790 			td->td_retval[0] = 0;
791 
792 			LIN_SDT_PROBE1(futex, linux_sys_futex, return, error);
793 			return (error);
794 		}
795 		td->td_retval[0] = futex_wake(f, args->val, args->val3);
796 		futex_put(f, NULL);
797 		break;
798 
799 	case LINUX_FUTEX_CMP_REQUEUE:
800 		LIN_SDT_PROBE5(futex, linux_sys_futex, debug_cmp_requeue,
801 		    args->uaddr, args->val, args->val3, args->uaddr2,
802 		    args->timeout);
803 		LINUX_CTR5(sys_futex, "CMP_REQUEUE uaddr %p "
804 		    "nrwake 0x%x uval 0x%x uaddr2 %p nrequeue 0x%x",
805 		    args->uaddr, args->val, args->val3, args->uaddr2,
806 		    args->timeout);
807 
808 		/*
809 		 * Linux allows this, we would not, it is an incorrect
810 		 * usage of declared ABI, so return EINVAL.
811 		 */
812 		if (args->uaddr == args->uaddr2) {
813 			LIN_SDT_PROBE0(futex, linux_sys_futex,
814 			    invalid_cmp_requeue_use);
815 			LIN_SDT_PROBE1(futex, linux_sys_futex, return, EINVAL);
816 			return (EINVAL);
817 		}
818 
819 		error = futex_get(args->uaddr, NULL, &f, flags);
820 		if (error) {
821 			LIN_SDT_PROBE1(futex, linux_sys_futex, return, error);
822 			return (error);
823 		}
824 
825 		/*
826 		 * To avoid deadlocks return EINVAL if second futex
827 		 * exists at this time.
828 		 *
829 		 * Glibc fall back to FUTEX_WAKE in case of any error
830 		 * returned by FUTEX_CMP_REQUEUE.
831 		 */
832 		error = futex_get(args->uaddr2, NULL, &f2,
833 		    flags | FUTEX_DONTEXISTS);
834 		if (error) {
835 			futex_put(f, NULL);
836 
837 			LIN_SDT_PROBE1(futex, linux_sys_futex, return, error);
838 			return (error);
839 		}
840 		error = copyin(args->uaddr, &val, sizeof(val));
841 		if (error) {
842 			LIN_SDT_PROBE1(futex, linux_sys_futex, copyin_error,
843 			    error);
844 			LINUX_CTR1(sys_futex, "CMP_REQUEUE copyin failed %d",
845 			    error);
846 			futex_put(f2, NULL);
847 			futex_put(f, NULL);
848 
849 			LIN_SDT_PROBE1(futex, linux_sys_futex, return, error);
850 			return (error);
851 		}
852 		if (val != args->val3) {
853 			LIN_SDT_PROBE2(futex, linux_sys_futex,
854 			    debug_cmp_requeue_value_neq, args->val, val);
855 			LINUX_CTR2(sys_futex, "CMP_REQUEUE val 0x%x != uval 0x%x",
856 			    args->val, val);
857 			futex_put(f2, NULL);
858 			futex_put(f, NULL);
859 
860 			LIN_SDT_PROBE1(futex, linux_sys_futex, return, EAGAIN);
861 			return (EAGAIN);
862 		}
863 
864 		nrwake = (int)(unsigned long)args->timeout;
865 		td->td_retval[0] = futex_requeue(f, args->val, f2, nrwake);
866 		futex_put(f2, NULL);
867 		futex_put(f, NULL);
868 		break;
869 
870 	case LINUX_FUTEX_WAKE_OP:
871 		LIN_SDT_PROBE5(futex, linux_sys_futex, debug_wake_op,
872 		    args->uaddr, args->op, args->val, args->uaddr2, args->val3);
873 		LINUX_CTR5(sys_futex, "WAKE_OP "
874 		    "uaddr %p nrwake 0x%x uaddr2 %p op 0x%x nrwake2 0x%x",
875 		    args->uaddr, args->val, args->uaddr2, args->val3,
876 		    args->timeout);
877 
878 		error = futex_get(args->uaddr, NULL, &f, flags);
879 		if (error) {
880 			LIN_SDT_PROBE1(futex, linux_sys_futex, return, error);
881 			return (error);
882 		}
883 
884 		if (args->uaddr != args->uaddr2)
885 			error = futex_get(args->uaddr2, NULL, &f2, flags);
886 		if (error) {
887 			futex_put(f, NULL);
888 
889 			LIN_SDT_PROBE1(futex, linux_sys_futex, return, error);
890 			return (error);
891 		}
892 
893 		/*
894 		 * This function returns positive number as results and
895 		 * negative as errors
896 		 */
897 		op_ret = futex_atomic_op(td, args->val3, args->uaddr2);
898 
899 		LINUX_CTR2(sys_futex, "WAKE_OP atomic_op uaddr %p ret 0x%x",
900 		    args->uaddr, op_ret);
901 
902 		if (op_ret < 0) {
903 			/* XXX: We don't handle the EFAULT yet. */
904 			if (op_ret != -EFAULT) {
905 				if (f2 != NULL)
906 					futex_put(f2, NULL);
907 				futex_put(f, NULL);
908 
909 				LIN_SDT_PROBE1(futex, linux_sys_futex, return,
910 				    -op_ret);
911 				return (-op_ret);
912 			} else {
913 				LIN_SDT_PROBE0(futex, linux_sys_futex,
914 				    unhandled_efault);
915 			}
916 			if (f2 != NULL)
917 				futex_put(f2, NULL);
918 			futex_put(f, NULL);
919 
920 			LIN_SDT_PROBE1(futex, linux_sys_futex, return, EFAULT);
921 			return (EFAULT);
922 		}
923 
924 		ret = futex_wake(f, args->val, args->val3);
925 
926 		if (op_ret > 0) {
927 			op_ret = 0;
928 			nrwake = (int)(unsigned long)args->timeout;
929 
930 			if (f2 != NULL)
931 				op_ret += futex_wake(f2, nrwake, args->val3);
932 			else
933 				op_ret += futex_wake(f, nrwake, args->val3);
934 			ret += op_ret;
935 
936 		}
937 		if (f2 != NULL)
938 			futex_put(f2, NULL);
939 		futex_put(f, NULL);
940 		td->td_retval[0] = ret;
941 		break;
942 
943 	case LINUX_FUTEX_LOCK_PI:
944 		/* not yet implemented */
945 		linux_msg(td,
946 			  "linux_sys_futex: "
947 			  "op LINUX_FUTEX_LOCK_PI not implemented\n");
948 		LIN_SDT_PROBE0(futex, linux_sys_futex, unimplemented_lock_pi);
949 		LIN_SDT_PROBE1(futex, linux_sys_futex, return, ENOSYS);
950 		return (ENOSYS);
951 
952 	case LINUX_FUTEX_UNLOCK_PI:
953 		/* not yet implemented */
954 		linux_msg(td,
955 			  "linux_sys_futex: "
956 			  "op LINUX_FUTEX_UNLOCK_PI not implemented\n");
957 		LIN_SDT_PROBE0(futex, linux_sys_futex, unimplemented_unlock_pi);
958 		LIN_SDT_PROBE1(futex, linux_sys_futex, return, ENOSYS);
959 		return (ENOSYS);
960 
961 	case LINUX_FUTEX_TRYLOCK_PI:
962 		/* not yet implemented */
963 		linux_msg(td,
964 			  "linux_sys_futex: "
965 			  "op LINUX_FUTEX_TRYLOCK_PI not implemented\n");
966 		LIN_SDT_PROBE0(futex, linux_sys_futex,
967 		    unimplemented_trylock_pi);
968 		LIN_SDT_PROBE1(futex, linux_sys_futex, return, ENOSYS);
969 		return (ENOSYS);
970 
971 	case LINUX_FUTEX_REQUEUE:
972 
973 		/*
974 		 * Glibc does not use this operation since version 2.3.3,
975 		 * as it is racy and replaced by FUTEX_CMP_REQUEUE operation.
976 		 * Glibc versions prior to 2.3.3 fall back to FUTEX_WAKE when
977 		 * FUTEX_REQUEUE returned EINVAL.
978 		 */
979 		em = em_find(td->td_proc, EMUL_DONTLOCK);
980 		if ((em->flags & LINUX_XDEPR_REQUEUEOP) == 0) {
981 			linux_msg(td,
982 				  "linux_sys_futex: "
983 				  "unsupported futex_requeue op\n");
984 			em->flags |= LINUX_XDEPR_REQUEUEOP;
985 			LIN_SDT_PROBE0(futex, linux_sys_futex,
986 			    deprecated_requeue);
987 		}
988 
989 		LIN_SDT_PROBE1(futex, linux_sys_futex, return, EINVAL);
990 		return (EINVAL);
991 
992 	case LINUX_FUTEX_WAIT_REQUEUE_PI:
993 		/* not yet implemented */
994 		linux_msg(td,
995 			  "linux_sys_futex: "
996 			  "op FUTEX_WAIT_REQUEUE_PI not implemented\n");
997 		LIN_SDT_PROBE0(futex, linux_sys_futex,
998 		    unimplemented_wait_requeue_pi);
999 		LIN_SDT_PROBE1(futex, linux_sys_futex, return, ENOSYS);
1000 		return (ENOSYS);
1001 
1002 	case LINUX_FUTEX_CMP_REQUEUE_PI:
1003 		/* not yet implemented */
1004 		linux_msg(td,
1005 			    "linux_sys_futex: "
1006 			    "op LINUX_FUTEX_CMP_REQUEUE_PI not implemented\n");
1007 		LIN_SDT_PROBE0(futex, linux_sys_futex,
1008 		    unimplemented_cmp_requeue_pi);
1009 		LIN_SDT_PROBE1(futex, linux_sys_futex, return, ENOSYS);
1010 		return (ENOSYS);
1011 
1012 	default:
1013 		linux_msg(td,
1014 			  "linux_sys_futex: unknown op %d\n", args->op);
1015 		LIN_SDT_PROBE1(futex, linux_sys_futex, unknown_operation,
1016 		    args->op);
1017 		LIN_SDT_PROBE1(futex, linux_sys_futex, return, ENOSYS);
1018 		return (ENOSYS);
1019 	}
1020 
1021 	LIN_SDT_PROBE1(futex, linux_sys_futex, return, error);
1022 	return (error);
1023 }
1024 
1025 int
1026 linux_set_robust_list(struct thread *td, struct linux_set_robust_list_args *args)
1027 {
1028 	struct linux_emuldata *em;
1029 
1030 	LIN_SDT_PROBE2(futex, linux_set_robust_list, entry, td, args);
1031 
1032 	if (args->len != sizeof(struct linux_robust_list_head)) {
1033 		LIN_SDT_PROBE0(futex, linux_set_robust_list, size_error);
1034 		LIN_SDT_PROBE1(futex, linux_set_robust_list, return, EINVAL);
1035 		return (EINVAL);
1036 	}
1037 
1038 	em = em_find(td->td_proc, EMUL_DOLOCK);
1039 	em->robust_futexes = args->head;
1040 	EMUL_UNLOCK(&emul_lock);
1041 
1042 	LIN_SDT_PROBE1(futex, linux_set_robust_list, return, 0);
1043 	return (0);
1044 }
1045 
1046 int
1047 linux_get_robust_list(struct thread *td, struct linux_get_robust_list_args *args)
1048 {
1049 	struct linux_emuldata *em;
1050 	struct linux_robust_list_head *head;
1051 	l_size_t len = sizeof(struct linux_robust_list_head);
1052 	int error = 0;
1053 
1054 	LIN_SDT_PROBE2(futex, linux_get_robust_list, entry, td, args);
1055 
1056 	if (!args->pid) {
1057 		em = em_find(td->td_proc, EMUL_DONTLOCK);
1058 		head = em->robust_futexes;
1059 	} else {
1060 		struct proc *p;
1061 
1062 		p = pfind(args->pid);
1063 		if (p == NULL) {
1064 			LIN_SDT_PROBE1(futex, linux_get_robust_list, return,
1065 			    ESRCH);
1066 			return (ESRCH);
1067 		}
1068 
1069 		em = em_find(p, EMUL_DONTLOCK);
1070 		/* XXX: ptrace? */
1071 		if (priv_check(td, PRIV_CRED_SETUID) ||
1072 		    priv_check(td, PRIV_CRED_SETEUID) ||
1073 		    p_candebug(td, p)) {
1074 			PROC_UNLOCK(p);
1075 
1076 			LIN_SDT_PROBE1(futex, linux_get_robust_list, return,
1077 			    EPERM);
1078 			return (EPERM);
1079 		}
1080 		head = em->robust_futexes;
1081 
1082 		PROC_UNLOCK(p);
1083 	}
1084 
1085 	error = copyout(&len, args->len, sizeof(l_size_t));
1086 	if (error) {
1087 		LIN_SDT_PROBE1(futex, linux_get_robust_list, copyout_error,
1088 		    error);
1089 		LIN_SDT_PROBE1(futex, linux_get_robust_list, return, EFAULT);
1090 		return (EFAULT);
1091 	}
1092 
1093 	error = copyout(head, args->head, sizeof(struct linux_robust_list_head));
1094 	if (error) {
1095 		LIN_SDT_PROBE1(futex, linux_get_robust_list, copyout_error,
1096 		    error);
1097 	}
1098 
1099 	LIN_SDT_PROBE1(futex, linux_get_robust_list, return, error);
1100 	return (error);
1101 }
1102 
1103 static int
1104 handle_futex_death(struct proc *p, uint32_t *uaddr, unsigned int pi)
1105 {
1106 	uint32_t uval, nval, mval;
1107 	struct futex *f;
1108 	int error;
1109 
1110 	LIN_SDT_PROBE3(futex, handle_futex_death, entry, p, uaddr, pi);
1111 
1112 retry:
1113 	error = copyin(uaddr, &uval, 4);
1114 	if (error) {
1115 		LIN_SDT_PROBE1(futex, handle_futex_death, copyin_error, error);
1116 		LIN_SDT_PROBE1(futex, handle_futex_death, return, EFAULT);
1117 		return (EFAULT);
1118 	}
1119 	if ((uval & FUTEX_TID_MASK) == p->p_pid) {
1120 		mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
1121 		nval = casuword32(uaddr, uval, mval);
1122 
1123 		if (nval == -1) {
1124 			LIN_SDT_PROBE1(futex, handle_futex_death, return,
1125 			    EFAULT);
1126 			return (EFAULT);
1127 		}
1128 
1129 		if (nval != uval)
1130 			goto retry;
1131 
1132 		if (!pi && (uval & FUTEX_WAITERS)) {
1133 			error = futex_get(uaddr, NULL, &f,
1134 			    FUTEX_DONTCREATE | FUTEX_SHARED);
1135 			if (error) {
1136 				LIN_SDT_PROBE1(futex, handle_futex_death,
1137 				    return, error);
1138 				return (error);
1139 			}
1140 			if (f != NULL) {
1141 				futex_wake(f, 1, FUTEX_BITSET_MATCH_ANY);
1142 				futex_put(f, NULL);
1143 			}
1144 		}
1145 	}
1146 
1147 	LIN_SDT_PROBE1(futex, handle_futex_death, return, 0);
1148 	return (0);
1149 }
1150 
1151 static int
1152 fetch_robust_entry(struct linux_robust_list **entry,
1153     struct linux_robust_list **head, unsigned int *pi)
1154 {
1155 	l_ulong uentry;
1156 	int error;
1157 
1158 	LIN_SDT_PROBE3(futex, fetch_robust_entry, entry, entry, head, pi);
1159 
1160 	error = copyin((const void *)head, &uentry, sizeof(l_ulong));
1161 	if (error) {
1162 		LIN_SDT_PROBE1(futex, fetch_robust_entry, copyin_error, error);
1163 		LIN_SDT_PROBE1(futex, fetch_robust_entry, return, EFAULT);
1164 		return (EFAULT);
1165 	}
1166 
1167 	*entry = (void *)(uentry & ~1UL);
1168 	*pi = uentry & 1;
1169 
1170 	LIN_SDT_PROBE1(futex, fetch_robust_entry, return, 0);
1171 	return (0);
1172 }
1173 
1174 /* This walks the list of robust futexes releasing them. */
1175 void
1176 release_futexes(struct proc *p)
1177 {
1178 	struct linux_robust_list_head *head = NULL;
1179 	struct linux_robust_list *entry, *next_entry, *pending;
1180 	unsigned int limit = 2048, pi, next_pi, pip;
1181 	struct linux_emuldata *em;
1182 	l_long futex_offset;
1183 	int rc, error;
1184 
1185 	LIN_SDT_PROBE1(futex, release_futexes, entry, p);
1186 
1187 	em = em_find(p, EMUL_DONTLOCK);
1188 	head = em->robust_futexes;
1189 
1190 	if (head == NULL) {
1191 		LIN_SDT_PROBE0(futex, release_futexes, return);
1192 		return;
1193 	}
1194 
1195 	if (fetch_robust_entry(&entry, PTRIN(&head->list.next), &pi)) {
1196 		LIN_SDT_PROBE0(futex, release_futexes, return);
1197 		return;
1198 	}
1199 
1200 	error = copyin(&head->futex_offset, &futex_offset,
1201 	    sizeof(futex_offset));
1202 	if (error) {
1203 		LIN_SDT_PROBE1(futex, release_futexes, copyin_error, error);
1204 		LIN_SDT_PROBE0(futex, release_futexes, return);
1205 		return;
1206 	}
1207 
1208 	if (fetch_robust_entry(&pending, PTRIN(&head->pending_list), &pip)) {
1209 		LIN_SDT_PROBE0(futex, release_futexes, return);
1210 		return;
1211 	}
1212 
1213 	while (entry != &head->list) {
1214 		rc = fetch_robust_entry(&next_entry, PTRIN(&entry->next), &next_pi);
1215 
1216 		if (entry != pending)
1217 			if (handle_futex_death(p,
1218 			    (uint32_t *)((caddr_t)entry + futex_offset), pi)) {
1219 				LIN_SDT_PROBE0(futex, release_futexes, return);
1220 				return;
1221 			}
1222 		if (rc) {
1223 			LIN_SDT_PROBE0(futex, release_futexes, return);
1224 			return;
1225 		}
1226 
1227 		entry = next_entry;
1228 		pi = next_pi;
1229 
1230 		if (!--limit)
1231 			break;
1232 
1233 		sched_relinquish(curthread);
1234 	}
1235 
1236 	if (pending)
1237 		handle_futex_death(p, (uint32_t *)((caddr_t)pending + futex_offset), pip);
1238 
1239 	LIN_SDT_PROBE0(futex, release_futexes, return);
1240 }
1241