xref: /linux/kernel/futex/syscalls.c (revision 2a52ca7c98960aafb0eca9ef96b2d0c932171357)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 
3 #include <linux/syscalls.h>
4 #include <linux/time_namespace.h>
5 
6 #include "futex.h"
7 
8 /*
9  * Support for robust futexes: the kernel cleans up held futexes at
10  * thread exit time.
11  *
12  * Implementation: user-space maintains a per-thread list of locks it
13  * is holding. Upon do_exit(), the kernel carefully walks this list,
14  * and marks all locks that are owned by this thread with the
15  * FUTEX_OWNER_DIED bit, and wakes up a waiter (if any). The list is
16  * always manipulated with the lock held, so the list is private and
17  * per-thread. Userspace also maintains a per-thread 'list_op_pending'
18  * field, to allow the kernel to clean up if the thread dies after
19  * acquiring the lock, but just before it could have added itself to
20  * the list. There can only be one such pending lock.
21  */
22 
23 /**
24  * sys_set_robust_list() - Set the robust-futex list head of a task
25  * @head:	pointer to the list-head
26  * @len:	length of the list-head, as userspace expects
27  */
28 SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head,
29 		size_t, len)
30 {
31 	/*
32 	 * The kernel knows only one size for now:
33 	 */
34 	if (unlikely(len != sizeof(*head)))
35 		return -EINVAL;
36 
37 	current->robust_list = head;
38 
39 	return 0;
40 }
41 
42 /**
43  * sys_get_robust_list() - Get the robust-futex list head of a task
44  * @pid:	pid of the process [zero for current task]
45  * @head_ptr:	pointer to a list-head pointer, the kernel fills it in
46  * @len_ptr:	pointer to a length field, the kernel fills in the header size
47  */
48 SYSCALL_DEFINE3(get_robust_list, int, pid,
49 		struct robust_list_head __user * __user *, head_ptr,
50 		size_t __user *, len_ptr)
51 {
52 	struct robust_list_head __user *head;
53 	unsigned long ret;
54 	struct task_struct *p;
55 
56 	rcu_read_lock();
57 
58 	ret = -ESRCH;
59 	if (!pid)
60 		p = current;
61 	else {
62 		p = find_task_by_vpid(pid);
63 		if (!p)
64 			goto err_unlock;
65 	}
66 
67 	ret = -EPERM;
68 	if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS))
69 		goto err_unlock;
70 
71 	head = p->robust_list;
72 	rcu_read_unlock();
73 
74 	if (put_user(sizeof(*head), len_ptr))
75 		return -EFAULT;
76 	return put_user(head, head_ptr);
77 
78 err_unlock:
79 	rcu_read_unlock();
80 
81 	return ret;
82 }
83 
84 long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
85 		u32 __user *uaddr2, u32 val2, u32 val3)
86 {
87 	unsigned int flags = futex_to_flags(op);
88 	int cmd = op & FUTEX_CMD_MASK;
89 
90 	if (flags & FLAGS_CLOCKRT) {
91 		if (cmd != FUTEX_WAIT_BITSET &&
92 		    cmd != FUTEX_WAIT_REQUEUE_PI &&
93 		    cmd != FUTEX_LOCK_PI2)
94 			return -ENOSYS;
95 	}
96 
97 	switch (cmd) {
98 	case FUTEX_WAIT:
99 		val3 = FUTEX_BITSET_MATCH_ANY;
100 		fallthrough;
101 	case FUTEX_WAIT_BITSET:
102 		return futex_wait(uaddr, flags, val, timeout, val3);
103 	case FUTEX_WAKE:
104 		val3 = FUTEX_BITSET_MATCH_ANY;
105 		fallthrough;
106 	case FUTEX_WAKE_BITSET:
107 		return futex_wake(uaddr, flags, val, val3);
108 	case FUTEX_REQUEUE:
109 		return futex_requeue(uaddr, flags, uaddr2, flags, val, val2, NULL, 0);
110 	case FUTEX_CMP_REQUEUE:
111 		return futex_requeue(uaddr, flags, uaddr2, flags, val, val2, &val3, 0);
112 	case FUTEX_WAKE_OP:
113 		return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3);
114 	case FUTEX_LOCK_PI:
115 		flags |= FLAGS_CLOCKRT;
116 		fallthrough;
117 	case FUTEX_LOCK_PI2:
118 		return futex_lock_pi(uaddr, flags, timeout, 0);
119 	case FUTEX_UNLOCK_PI:
120 		return futex_unlock_pi(uaddr, flags);
121 	case FUTEX_TRYLOCK_PI:
122 		return futex_lock_pi(uaddr, flags, NULL, 1);
123 	case FUTEX_WAIT_REQUEUE_PI:
124 		val3 = FUTEX_BITSET_MATCH_ANY;
125 		return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3,
126 					     uaddr2);
127 	case FUTEX_CMP_REQUEUE_PI:
128 		return futex_requeue(uaddr, flags, uaddr2, flags, val, val2, &val3, 1);
129 	}
130 	return -ENOSYS;
131 }
132 
133 static __always_inline bool futex_cmd_has_timeout(u32 cmd)
134 {
135 	switch (cmd) {
136 	case FUTEX_WAIT:
137 	case FUTEX_LOCK_PI:
138 	case FUTEX_LOCK_PI2:
139 	case FUTEX_WAIT_BITSET:
140 	case FUTEX_WAIT_REQUEUE_PI:
141 		return true;
142 	}
143 	return false;
144 }
145 
146 static __always_inline int
147 futex_init_timeout(u32 cmd, u32 op, struct timespec64 *ts, ktime_t *t)
148 {
149 	if (!timespec64_valid(ts))
150 		return -EINVAL;
151 
152 	*t = timespec64_to_ktime(*ts);
153 	if (cmd == FUTEX_WAIT)
154 		*t = ktime_add_safe(ktime_get(), *t);
155 	else if (cmd != FUTEX_LOCK_PI && !(op & FUTEX_CLOCK_REALTIME))
156 		*t = timens_ktime_to_host(CLOCK_MONOTONIC, *t);
157 	return 0;
158 }
159 
160 SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
161 		const struct __kernel_timespec __user *, utime,
162 		u32 __user *, uaddr2, u32, val3)
163 {
164 	int ret, cmd = op & FUTEX_CMD_MASK;
165 	ktime_t t, *tp = NULL;
166 	struct timespec64 ts;
167 
168 	if (utime && futex_cmd_has_timeout(cmd)) {
169 		if (unlikely(should_fail_futex(!(op & FUTEX_PRIVATE_FLAG))))
170 			return -EFAULT;
171 		if (get_timespec64(&ts, utime))
172 			return -EFAULT;
173 		ret = futex_init_timeout(cmd, op, &ts, &t);
174 		if (ret)
175 			return ret;
176 		tp = &t;
177 	}
178 
179 	return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3);
180 }
181 
182 /**
183  * futex_parse_waitv - Parse a waitv array from userspace
184  * @futexv:	Kernel side list of waiters to be filled
185  * @uwaitv:     Userspace list to be parsed
186  * @nr_futexes: Length of futexv
187  * @wake:	Wake to call when futex is woken
188  * @wake_data:	Data for the wake handler
189  *
190  * Return: Error code on failure, 0 on success
191  */
192 int futex_parse_waitv(struct futex_vector *futexv,
193 		      struct futex_waitv __user *uwaitv,
194 		      unsigned int nr_futexes, futex_wake_fn *wake,
195 		      void *wake_data)
196 {
197 	struct futex_waitv aux;
198 	unsigned int i;
199 
200 	for (i = 0; i < nr_futexes; i++) {
201 		unsigned int flags;
202 
203 		if (copy_from_user(&aux, &uwaitv[i], sizeof(aux)))
204 			return -EFAULT;
205 
206 		if ((aux.flags & ~FUTEX2_VALID_MASK) || aux.__reserved)
207 			return -EINVAL;
208 
209 		flags = futex2_to_flags(aux.flags);
210 		if (!futex_flags_valid(flags))
211 			return -EINVAL;
212 
213 		if (!futex_validate_input(flags, aux.val))
214 			return -EINVAL;
215 
216 		futexv[i].w.flags = flags;
217 		futexv[i].w.val = aux.val;
218 		futexv[i].w.uaddr = aux.uaddr;
219 		futexv[i].q = futex_q_init;
220 		futexv[i].q.wake = wake;
221 		futexv[i].q.wake_data = wake_data;
222 	}
223 
224 	return 0;
225 }
226 
227 static int futex2_setup_timeout(struct __kernel_timespec __user *timeout,
228 				clockid_t clockid, struct hrtimer_sleeper *to)
229 {
230 	int flag_clkid = 0, flag_init = 0;
231 	struct timespec64 ts;
232 	ktime_t time;
233 	int ret;
234 
235 	if (!timeout)
236 		return 0;
237 
238 	if (clockid == CLOCK_REALTIME) {
239 		flag_clkid = FLAGS_CLOCKRT;
240 		flag_init = FUTEX_CLOCK_REALTIME;
241 	}
242 
243 	if (clockid != CLOCK_REALTIME && clockid != CLOCK_MONOTONIC)
244 		return -EINVAL;
245 
246 	if (get_timespec64(&ts, timeout))
247 		return -EFAULT;
248 
249 	/*
250 	 * Since there's no opcode for futex_waitv, use
251 	 * FUTEX_WAIT_BITSET that uses absolute timeout as well
252 	 */
253 	ret = futex_init_timeout(FUTEX_WAIT_BITSET, flag_init, &ts, &time);
254 	if (ret)
255 		return ret;
256 
257 	futex_setup_timer(&time, to, flag_clkid, 0);
258 	return 0;
259 }
260 
261 static inline void futex2_destroy_timeout(struct hrtimer_sleeper *to)
262 {
263 	hrtimer_cancel(&to->timer);
264 	destroy_hrtimer_on_stack(&to->timer);
265 }
266 
267 /**
268  * sys_futex_waitv - Wait on a list of futexes
269  * @waiters:    List of futexes to wait on
270  * @nr_futexes: Length of futexv
271  * @flags:      Flag for timeout (monotonic/realtime)
272  * @timeout:	Optional absolute timeout.
273  * @clockid:	Clock to be used for the timeout, realtime or monotonic.
274  *
275  * Given an array of `struct futex_waitv`, wait on each uaddr. The thread wakes
276  * if a futex_wake() is performed at any uaddr. The syscall returns immediately
277  * if any waiter has *uaddr != val. *timeout is an optional timeout value for
278  * the operation. Each waiter has individual flags. The `flags` argument for
279  * the syscall should be used solely for specifying the timeout as realtime, if
280  * needed. Flags for private futexes, sizes, etc. should be used on the
281  * individual flags of each waiter.
282  *
283  * Returns the array index of one of the woken futexes. No further information
284  * is provided: any number of other futexes may also have been woken by the
285  * same event, and if more than one futex was woken, the retrned index may
286  * refer to any one of them. (It is not necessaryily the futex with the
287  * smallest index, nor the one most recently woken, nor...)
288  */
289 
290 SYSCALL_DEFINE5(futex_waitv, struct futex_waitv __user *, waiters,
291 		unsigned int, nr_futexes, unsigned int, flags,
292 		struct __kernel_timespec __user *, timeout, clockid_t, clockid)
293 {
294 	struct hrtimer_sleeper to;
295 	struct futex_vector *futexv;
296 	int ret;
297 
298 	/* This syscall supports no flags for now */
299 	if (flags)
300 		return -EINVAL;
301 
302 	if (!nr_futexes || nr_futexes > FUTEX_WAITV_MAX || !waiters)
303 		return -EINVAL;
304 
305 	if (timeout && (ret = futex2_setup_timeout(timeout, clockid, &to)))
306 		return ret;
307 
308 	futexv = kcalloc(nr_futexes, sizeof(*futexv), GFP_KERNEL);
309 	if (!futexv) {
310 		ret = -ENOMEM;
311 		goto destroy_timer;
312 	}
313 
314 	ret = futex_parse_waitv(futexv, waiters, nr_futexes, futex_wake_mark,
315 				NULL);
316 	if (!ret)
317 		ret = futex_wait_multiple(futexv, nr_futexes, timeout ? &to : NULL);
318 
319 	kfree(futexv);
320 
321 destroy_timer:
322 	if (timeout)
323 		futex2_destroy_timeout(&to);
324 	return ret;
325 }
326 
327 /*
328  * sys_futex_wake - Wake a number of futexes
329  * @uaddr:	Address of the futex(es) to wake
330  * @mask:	bitmask
331  * @nr:		Number of the futexes to wake
332  * @flags:	FUTEX2 flags
333  *
334  * Identical to the traditional FUTEX_WAKE_BITSET op, except it is part of the
335  * futex2 family of calls.
336  */
337 
338 SYSCALL_DEFINE4(futex_wake,
339 		void __user *, uaddr,
340 		unsigned long, mask,
341 		int, nr,
342 		unsigned int, flags)
343 {
344 	if (flags & ~FUTEX2_VALID_MASK)
345 		return -EINVAL;
346 
347 	flags = futex2_to_flags(flags);
348 	if (!futex_flags_valid(flags))
349 		return -EINVAL;
350 
351 	if (!futex_validate_input(flags, mask))
352 		return -EINVAL;
353 
354 	return futex_wake(uaddr, FLAGS_STRICT | flags, nr, mask);
355 }
356 
357 /*
358  * sys_futex_wait - Wait on a futex
359  * @uaddr:	Address of the futex to wait on
360  * @val:	Value of @uaddr
361  * @mask:	bitmask
362  * @flags:	FUTEX2 flags
363  * @timeout:	Optional absolute timeout
364  * @clockid:	Clock to be used for the timeout, realtime or monotonic
365  *
366  * Identical to the traditional FUTEX_WAIT_BITSET op, except it is part of the
367  * futex2 familiy of calls.
368  */
369 
370 SYSCALL_DEFINE6(futex_wait,
371 		void __user *, uaddr,
372 		unsigned long, val,
373 		unsigned long, mask,
374 		unsigned int, flags,
375 		struct __kernel_timespec __user *, timeout,
376 		clockid_t, clockid)
377 {
378 	struct hrtimer_sleeper to;
379 	int ret;
380 
381 	if (flags & ~FUTEX2_VALID_MASK)
382 		return -EINVAL;
383 
384 	flags = futex2_to_flags(flags);
385 	if (!futex_flags_valid(flags))
386 		return -EINVAL;
387 
388 	if (!futex_validate_input(flags, val) ||
389 	    !futex_validate_input(flags, mask))
390 		return -EINVAL;
391 
392 	if (timeout && (ret = futex2_setup_timeout(timeout, clockid, &to)))
393 		return ret;
394 
395 	ret = __futex_wait(uaddr, flags, val, timeout ? &to : NULL, mask);
396 
397 	if (timeout)
398 		futex2_destroy_timeout(&to);
399 
400 	return ret;
401 }
402 
403 /*
404  * sys_futex_requeue - Requeue a waiter from one futex to another
405  * @waiters:	array describing the source and destination futex
406  * @flags:	unused
407  * @nr_wake:	number of futexes to wake
408  * @nr_requeue:	number of futexes to requeue
409  *
410  * Identical to the traditional FUTEX_CMP_REQUEUE op, except it is part of the
411  * futex2 family of calls.
412  */
413 
414 SYSCALL_DEFINE4(futex_requeue,
415 		struct futex_waitv __user *, waiters,
416 		unsigned int, flags,
417 		int, nr_wake,
418 		int, nr_requeue)
419 {
420 	struct futex_vector futexes[2];
421 	u32 cmpval;
422 	int ret;
423 
424 	if (flags)
425 		return -EINVAL;
426 
427 	if (!waiters)
428 		return -EINVAL;
429 
430 	ret = futex_parse_waitv(futexes, waiters, 2, futex_wake_mark, NULL);
431 	if (ret)
432 		return ret;
433 
434 	cmpval = futexes[0].w.val;
435 
436 	return futex_requeue(u64_to_user_ptr(futexes[0].w.uaddr), futexes[0].w.flags,
437 			     u64_to_user_ptr(futexes[1].w.uaddr), futexes[1].w.flags,
438 			     nr_wake, nr_requeue, &cmpval, 0);
439 }
440 
441 #ifdef CONFIG_COMPAT
442 COMPAT_SYSCALL_DEFINE2(set_robust_list,
443 		struct compat_robust_list_head __user *, head,
444 		compat_size_t, len)
445 {
446 	if (unlikely(len != sizeof(*head)))
447 		return -EINVAL;
448 
449 	current->compat_robust_list = head;
450 
451 	return 0;
452 }
453 
454 COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid,
455 			compat_uptr_t __user *, head_ptr,
456 			compat_size_t __user *, len_ptr)
457 {
458 	struct compat_robust_list_head __user *head;
459 	unsigned long ret;
460 	struct task_struct *p;
461 
462 	rcu_read_lock();
463 
464 	ret = -ESRCH;
465 	if (!pid)
466 		p = current;
467 	else {
468 		p = find_task_by_vpid(pid);
469 		if (!p)
470 			goto err_unlock;
471 	}
472 
473 	ret = -EPERM;
474 	if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS))
475 		goto err_unlock;
476 
477 	head = p->compat_robust_list;
478 	rcu_read_unlock();
479 
480 	if (put_user(sizeof(*head), len_ptr))
481 		return -EFAULT;
482 	return put_user(ptr_to_compat(head), head_ptr);
483 
484 err_unlock:
485 	rcu_read_unlock();
486 
487 	return ret;
488 }
489 #endif /* CONFIG_COMPAT */
490 
491 #ifdef CONFIG_COMPAT_32BIT_TIME
492 SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val,
493 		const struct old_timespec32 __user *, utime, u32 __user *, uaddr2,
494 		u32, val3)
495 {
496 	int ret, cmd = op & FUTEX_CMD_MASK;
497 	ktime_t t, *tp = NULL;
498 	struct timespec64 ts;
499 
500 	if (utime && futex_cmd_has_timeout(cmd)) {
501 		if (get_old_timespec32(&ts, utime))
502 			return -EFAULT;
503 		ret = futex_init_timeout(cmd, op, &ts, &t);
504 		if (ret)
505 			return ret;
506 		tp = &t;
507 	}
508 
509 	return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3);
510 }
511 #endif /* CONFIG_COMPAT_32BIT_TIME */
512 
513