xref: /linux/kernel/futex/syscalls.c (revision 3ca9595d9fb6cce6633a5b03d98c2aecb5499838)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 
3 #include <linux/syscalls.h>
4 #include <linux/time_namespace.h>
5 
6 #include "futex.h"
7 
8 /*
9  * Support for robust futexes: the kernel cleans up held futexes at
10  * thread exit time.
11  *
12  * Implementation: user-space maintains a per-thread list of locks it
13  * is holding. Upon do_exit(), the kernel carefully walks this list,
14  * and marks all locks that are owned by this thread with the
15  * FUTEX_OWNER_DIED bit, and wakes up a waiter (if any). The list is
16  * always manipulated with the lock held, so the list is private and
17  * per-thread. Userspace also maintains a per-thread 'list_op_pending'
18  * field, to allow the kernel to clean up if the thread dies after
19  * acquiring the lock, but just before it could have added itself to
20  * the list. There can only be one such pending lock.
21  */
22 
23 /**
24  * sys_set_robust_list() - Set the robust-futex list head of a task
25  * @head:	pointer to the list-head
26  * @len:	length of the list-head, as userspace expects
27  */
28 SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head, size_t, len)
29 {
30 	/* The kernel knows only one size for now. */
31 	if (unlikely(len != sizeof(*head)))
32 		return -EINVAL;
33 
34 	current->futex.robust_list = head;
35 	return 0;
36 }
37 
38 static inline void __user *futex_task_robust_list(struct task_struct *p, bool compat)
39 {
40 #ifdef CONFIG_COMPAT
41 	if (compat)
42 		return p->futex.compat_robust_list;
43 #endif
44 	return p->futex.robust_list;
45 }
46 
47 static void __user *futex_get_robust_list_common(int pid, bool compat)
48 {
49 	struct task_struct *p = current;
50 	void __user *head;
51 	int ret;
52 
53 	scoped_guard(rcu) {
54 		if (pid) {
55 			p = find_task_by_vpid(pid);
56 			if (!p)
57 				return (void __user *)ERR_PTR(-ESRCH);
58 		}
59 		get_task_struct(p);
60 	}
61 
62 	/*
63 	 * Hold exec_update_lock to serialize with concurrent exec()
64 	 * so ptrace_may_access() is checked against stable credentials
65 	 */
66 	ret = down_read_killable(&p->signal->exec_update_lock);
67 	if (ret)
68 		goto err_put;
69 
70 	ret = -EPERM;
71 	if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS))
72 		goto err_unlock;
73 
74 	head = futex_task_robust_list(p, compat);
75 
76 	up_read(&p->signal->exec_update_lock);
77 	put_task_struct(p);
78 
79 	return head;
80 
81 err_unlock:
82 	up_read(&p->signal->exec_update_lock);
83 err_put:
84 	put_task_struct(p);
85 	return (void __user *)ERR_PTR(ret);
86 }
87 
88 /**
89  * sys_get_robust_list() - Get the robust-futex list head of a task
90  * @pid:	pid of the process [zero for current task]
91  * @head_ptr:	pointer to a list-head pointer, the kernel fills it in
92  * @len_ptr:	pointer to a length field, the kernel fills in the header size
93  */
94 SYSCALL_DEFINE3(get_robust_list, int, pid,
95 		struct robust_list_head __user * __user *, head_ptr,
96 		size_t __user *, len_ptr)
97 {
98 	struct robust_list_head __user *head = futex_get_robust_list_common(pid, false);
99 
100 	if (IS_ERR(head))
101 		return PTR_ERR(head);
102 
103 	if (put_user(sizeof(*head), len_ptr))
104 		return -EFAULT;
105 	return put_user(head, head_ptr);
106 }
107 
108 long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
109 		u32 __user *uaddr2, u32 val2, u32 val3)
110 {
111 	unsigned int flags = futex_to_flags(op);
112 	int cmd = op & FUTEX_CMD_MASK;
113 
114 	if (flags & FLAGS_CLOCKRT) {
115 		if (cmd != FUTEX_WAIT_BITSET &&
116 		    cmd != FUTEX_WAIT_REQUEUE_PI &&
117 		    cmd != FUTEX_LOCK_PI2)
118 			return -ENOSYS;
119 	}
120 
121 	if (flags & FLAGS_ROBUST_UNLOCK) {
122 		if (cmd != FUTEX_WAKE &&
123 		    cmd != FUTEX_WAKE_BITSET &&
124 		    cmd != FUTEX_UNLOCK_PI)
125 			return -ENOSYS;
126 	}
127 
128 	switch (cmd) {
129 	case FUTEX_WAIT:
130 		val3 = FUTEX_BITSET_MATCH_ANY;
131 		fallthrough;
132 	case FUTEX_WAIT_BITSET:
133 		return futex_wait(uaddr, flags, val, timeout, val3);
134 	case FUTEX_WAKE:
135 		val3 = FUTEX_BITSET_MATCH_ANY;
136 		fallthrough;
137 	case FUTEX_WAKE_BITSET:
138 		return futex_wake(uaddr, flags, uaddr2, val, val3);
139 	case FUTEX_REQUEUE:
140 		return futex_requeue(uaddr, flags, uaddr2, flags, val, val2, NULL, 0);
141 	case FUTEX_CMP_REQUEUE:
142 		return futex_requeue(uaddr, flags, uaddr2, flags, val, val2, &val3, 0);
143 	case FUTEX_WAKE_OP:
144 		return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3);
145 	case FUTEX_LOCK_PI:
146 		flags |= FLAGS_CLOCKRT;
147 		fallthrough;
148 	case FUTEX_LOCK_PI2:
149 		return futex_lock_pi(uaddr, flags, timeout, 0);
150 	case FUTEX_UNLOCK_PI:
151 		return futex_unlock_pi(uaddr, flags, uaddr2);
152 	case FUTEX_TRYLOCK_PI:
153 		return futex_lock_pi(uaddr, flags, NULL, 1);
154 	case FUTEX_WAIT_REQUEUE_PI:
155 		val3 = FUTEX_BITSET_MATCH_ANY;
156 		return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3,
157 					     uaddr2);
158 	case FUTEX_CMP_REQUEUE_PI:
159 		return futex_requeue(uaddr, flags, uaddr2, flags, val, val2, &val3, 1);
160 	}
161 	return -ENOSYS;
162 }
163 
164 static __always_inline bool futex_cmd_has_timeout(u32 cmd)
165 {
166 	switch (cmd) {
167 	case FUTEX_WAIT:
168 	case FUTEX_LOCK_PI:
169 	case FUTEX_LOCK_PI2:
170 	case FUTEX_WAIT_BITSET:
171 	case FUTEX_WAIT_REQUEUE_PI:
172 		return true;
173 	}
174 	return false;
175 }
176 
177 static __always_inline int
178 futex_init_timeout(u32 cmd, u32 op, struct timespec64 *ts, ktime_t *t)
179 {
180 	if (!timespec64_valid(ts))
181 		return -EINVAL;
182 
183 	*t = timespec64_to_ktime(*ts);
184 	if (cmd == FUTEX_WAIT)
185 		*t = ktime_add_safe(ktime_get(), *t);
186 	else if (cmd != FUTEX_LOCK_PI && !(op & FUTEX_CLOCK_REALTIME))
187 		*t = timens_ktime_to_host(CLOCK_MONOTONIC, *t);
188 	return 0;
189 }
190 
191 SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
192 		const struct __kernel_timespec __user *, utime,
193 		u32 __user *, uaddr2, u32, val3)
194 {
195 	int ret, cmd = op & FUTEX_CMD_MASK;
196 	ktime_t t, *tp = NULL;
197 	struct timespec64 ts;
198 
199 	if (utime && futex_cmd_has_timeout(cmd)) {
200 		if (unlikely(should_fail_futex(!(op & FUTEX_PRIVATE_FLAG))))
201 			return -EFAULT;
202 		if (get_timespec64(&ts, utime))
203 			return -EFAULT;
204 		ret = futex_init_timeout(cmd, op, &ts, &t);
205 		if (ret)
206 			return ret;
207 		tp = &t;
208 	}
209 
210 	return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3);
211 }
212 
213 /**
214  * futex_parse_waitv - Parse a waitv array from userspace
215  * @futexv:	Kernel side list of waiters to be filled
216  * @uwaitv:     Userspace list to be parsed
217  * @nr_futexes: Length of futexv
218  * @wake:	Wake to call when futex is woken
219  * @wake_data:	Data for the wake handler
220  *
221  * Return: Error code on failure, 0 on success
222  */
223 int futex_parse_waitv(struct futex_vector *futexv,
224 		      struct futex_waitv __user *uwaitv,
225 		      unsigned int nr_futexes, futex_wake_fn *wake,
226 		      void *wake_data)
227 {
228 	struct futex_waitv aux;
229 	unsigned int i;
230 
231 	for (i = 0; i < nr_futexes; i++) {
232 		unsigned int flags;
233 
234 		if (copy_from_user(&aux, &uwaitv[i], sizeof(aux)))
235 			return -EFAULT;
236 
237 		if ((aux.flags & ~FUTEX2_VALID_MASK) || aux.__reserved)
238 			return -EINVAL;
239 
240 		flags = futex2_to_flags(aux.flags);
241 		if (!futex_flags_valid(flags))
242 			return -EINVAL;
243 
244 		if (!futex_validate_input(flags, aux.val))
245 			return -EINVAL;
246 
247 		futexv[i].w.flags = flags;
248 		futexv[i].w.val = aux.val;
249 		futexv[i].w.uaddr = aux.uaddr;
250 		futexv[i].q = futex_q_init;
251 		futexv[i].q.wake = wake;
252 		futexv[i].q.wake_data = wake_data;
253 	}
254 
255 	return 0;
256 }
257 
258 static int futex2_setup_timeout(struct __kernel_timespec __user *timeout,
259 				clockid_t clockid, struct hrtimer_sleeper *to)
260 {
261 	int flag_clkid = 0, flag_init = 0;
262 	struct timespec64 ts;
263 	ktime_t time;
264 	int ret;
265 
266 	if (!timeout)
267 		return 0;
268 
269 	if (clockid == CLOCK_REALTIME) {
270 		flag_clkid = FLAGS_CLOCKRT;
271 		flag_init = FUTEX_CLOCK_REALTIME;
272 	}
273 
274 	if (clockid != CLOCK_REALTIME && clockid != CLOCK_MONOTONIC)
275 		return -EINVAL;
276 
277 	if (get_timespec64(&ts, timeout))
278 		return -EFAULT;
279 
280 	/*
281 	 * Since there's no opcode for futex_waitv, use
282 	 * FUTEX_WAIT_BITSET that uses absolute timeout as well
283 	 */
284 	ret = futex_init_timeout(FUTEX_WAIT_BITSET, flag_init, &ts, &time);
285 	if (ret)
286 		return ret;
287 
288 	futex_setup_timer(&time, to, flag_clkid, 0);
289 	return 0;
290 }
291 
292 static inline void futex2_destroy_timeout(struct hrtimer_sleeper *to)
293 {
294 	hrtimer_cancel(&to->timer);
295 	destroy_hrtimer_on_stack(&to->timer);
296 }
297 
298 /**
299  * sys_futex_waitv - Wait on a list of futexes
300  * @waiters:    List of futexes to wait on
301  * @nr_futexes: Length of futexv
302  * @flags:      Flag for timeout (monotonic/realtime)
303  * @timeout:	Optional absolute timeout.
304  * @clockid:	Clock to be used for the timeout, realtime or monotonic.
305  *
306  * Given an array of `struct futex_waitv`, wait on each uaddr. The thread wakes
307  * if a futex_wake() is performed at any uaddr. The syscall returns immediately
308  * if any waiter has *uaddr != val. *timeout is an optional timeout value for
309  * the operation. Each waiter has individual flags. The `flags` argument for
310  * the syscall should be used solely for specifying the timeout as realtime, if
311  * needed. Flags for private futexes, sizes, etc. should be used on the
312  * individual flags of each waiter.
313  *
314  * Returns the array index of one of the woken futexes. No further information
315  * is provided: any number of other futexes may also have been woken by the
316  * same event, and if more than one futex was woken, the retrned index may
317  * refer to any one of them. (It is not necessaryily the futex with the
318  * smallest index, nor the one most recently woken, nor...)
319  */
320 
321 SYSCALL_DEFINE5(futex_waitv, struct futex_waitv __user *, waiters,
322 		unsigned int, nr_futexes, unsigned int, flags,
323 		struct __kernel_timespec __user *, timeout, clockid_t, clockid)
324 {
325 	struct hrtimer_sleeper to;
326 	struct futex_vector *futexv;
327 	int ret;
328 
329 	/* This syscall supports no flags for now */
330 	if (flags)
331 		return -EINVAL;
332 
333 	if (!nr_futexes || nr_futexes > FUTEX_WAITV_MAX || !waiters)
334 		return -EINVAL;
335 
336 	if (timeout && (ret = futex2_setup_timeout(timeout, clockid, &to)))
337 		return ret;
338 
339 	futexv = kzalloc_objs(*futexv, nr_futexes);
340 	if (!futexv) {
341 		ret = -ENOMEM;
342 		goto destroy_timer;
343 	}
344 
345 	ret = futex_parse_waitv(futexv, waiters, nr_futexes, futex_wake_mark,
346 				NULL);
347 	if (!ret)
348 		ret = futex_wait_multiple(futexv, nr_futexes, timeout ? &to : NULL);
349 
350 	kfree(futexv);
351 
352 destroy_timer:
353 	if (timeout)
354 		futex2_destroy_timeout(&to);
355 	return ret;
356 }
357 
358 /*
359  * sys_futex_wake - Wake a number of futexes
360  * @uaddr:	Address of the futex(es) to wake
361  * @mask:	bitmask
362  * @nr:		Number of the futexes to wake
363  * @flags:	FUTEX2 flags
364  *
365  * Identical to the traditional FUTEX_WAKE_BITSET op, except it is part of the
366  * futex2 family of calls.
367  */
368 
369 SYSCALL_DEFINE4(futex_wake,
370 		void __user *, uaddr,
371 		unsigned long, mask,
372 		int, nr,
373 		unsigned int, flags)
374 {
375 	if (flags & ~FUTEX2_VALID_MASK)
376 		return -EINVAL;
377 
378 	flags = futex2_to_flags(flags);
379 	if (!futex_flags_valid(flags))
380 		return -EINVAL;
381 
382 	if (!futex_validate_input(flags, mask))
383 		return -EINVAL;
384 
385 	return futex_wake(uaddr, FLAGS_STRICT | flags, NULL, nr, mask);
386 }
387 
388 /*
389  * sys_futex_wait - Wait on a futex
390  * @uaddr:	Address of the futex to wait on
391  * @val:	Value of @uaddr
392  * @mask:	bitmask
393  * @flags:	FUTEX2 flags
394  * @timeout:	Optional absolute timeout
395  * @clockid:	Clock to be used for the timeout, realtime or monotonic
396  *
397  * Identical to the traditional FUTEX_WAIT_BITSET op, except it is part of the
398  * futex2 familiy of calls.
399  */
400 
401 SYSCALL_DEFINE6(futex_wait,
402 		void __user *, uaddr,
403 		unsigned long, val,
404 		unsigned long, mask,
405 		unsigned int, flags,
406 		struct __kernel_timespec __user *, timeout,
407 		clockid_t, clockid)
408 {
409 	struct hrtimer_sleeper to;
410 	int ret;
411 
412 	if (flags & ~FUTEX2_VALID_MASK)
413 		return -EINVAL;
414 
415 	flags = futex2_to_flags(flags);
416 	if (!futex_flags_valid(flags))
417 		return -EINVAL;
418 
419 	if (!futex_validate_input(flags, val) ||
420 	    !futex_validate_input(flags, mask))
421 		return -EINVAL;
422 
423 	if (timeout && (ret = futex2_setup_timeout(timeout, clockid, &to)))
424 		return ret;
425 
426 	ret = __futex_wait(uaddr, flags, val, timeout ? &to : NULL, mask);
427 
428 	if (timeout)
429 		futex2_destroy_timeout(&to);
430 
431 	return ret;
432 }
433 
434 /*
435  * sys_futex_requeue - Requeue a waiter from one futex to another
436  * @waiters:	array describing the source and destination futex
437  * @flags:	unused
438  * @nr_wake:	number of futexes to wake
439  * @nr_requeue:	number of futexes to requeue
440  *
441  * Identical to the traditional FUTEX_CMP_REQUEUE op, except it is part of the
442  * futex2 family of calls.
443  */
444 
445 SYSCALL_DEFINE4(futex_requeue,
446 		struct futex_waitv __user *, waiters,
447 		unsigned int, flags,
448 		int, nr_wake,
449 		int, nr_requeue)
450 {
451 	struct futex_vector futexes[2];
452 	u32 cmpval;
453 	int ret;
454 
455 	if (flags)
456 		return -EINVAL;
457 
458 	if (!waiters)
459 		return -EINVAL;
460 
461 	ret = futex_parse_waitv(futexes, waiters, 2, futex_wake_mark, NULL);
462 	if (ret)
463 		return ret;
464 
465 	/*
466 	 * For now mandate both flags are identical, like the sys_futex()
467 	 * interface has. If/when we merge the variable sized futex support,
468 	 * that patch can modify this test to allow a difference in size.
469 	 */
470 	if (futexes[0].w.flags != futexes[1].w.flags)
471 		return -EINVAL;
472 
473 	cmpval = futexes[0].w.val;
474 
475 	return futex_requeue(u64_to_user_ptr(futexes[0].w.uaddr), futexes[0].w.flags,
476 			     u64_to_user_ptr(futexes[1].w.uaddr), futexes[1].w.flags,
477 			     nr_wake, nr_requeue, &cmpval, 0);
478 }
479 
480 #ifdef CONFIG_COMPAT
481 COMPAT_SYSCALL_DEFINE2(set_robust_list, struct compat_robust_list_head __user *, head,
482 		       compat_size_t, len)
483 {
484 	if (unlikely(len != sizeof(*head)))
485 		return -EINVAL;
486 
487 	current->futex.compat_robust_list = head;
488 	return 0;
489 }
490 
491 COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid,
492 			compat_uptr_t __user *, head_ptr,
493 			compat_size_t __user *, len_ptr)
494 {
495 	struct compat_robust_list_head __user *head = futex_get_robust_list_common(pid, true);
496 
497 	if (IS_ERR(head))
498 		return PTR_ERR(head);
499 
500 	if (put_user(sizeof(*head), len_ptr))
501 		return -EFAULT;
502 	return put_user(ptr_to_compat(head), head_ptr);
503 }
504 #endif /* CONFIG_COMPAT */
505 
506 #ifdef CONFIG_COMPAT_32BIT_TIME
507 SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val,
508 		const struct old_timespec32 __user *, utime, u32 __user *, uaddr2,
509 		u32, val3)
510 {
511 	int ret, cmd = op & FUTEX_CMD_MASK;
512 	ktime_t t, *tp = NULL;
513 	struct timespec64 ts;
514 
515 	if (utime && futex_cmd_has_timeout(cmd)) {
516 		if (get_old_timespec32(&ts, utime))
517 			return -EFAULT;
518 		ret = futex_init_timeout(cmd, op, &ts, &t);
519 		if (ret)
520 			return ret;
521 		tp = &t;
522 	}
523 
524 	return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3);
525 }
526 #endif /* CONFIG_COMPAT_32BIT_TIME */
527