// SPDX-License-Identifier: GPL-2.0+
/*
 * Restartable sequences system call
 *
 * Copyright (C) 2015, Google, Inc.,
 * Paul Turner <pjt@google.com> and Andrew Hunter <ahh@google.com>
 * Copyright (C) 2015-2018, EfficiOS Inc.,
 * Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
 */

/*
 * Restartable sequences are a lightweight interface that allows
 * user-level code to be executed atomically relative to scheduler
 * preemption and signal delivery. Typically used for implementing
 * per-cpu operations.
 *
 * It allows user-space to perform update operations on per-cpu data
 * without requiring heavy-weight atomic operations.
 *
 * Detailed algorithm of rseq user-space assembly sequences:
 *
 *                     init(rseq_cs)
 *                     cpu = TLS->rseq::cpu_id_start
 *   [1]               TLS->rseq::rseq_cs = rseq_cs
 *   [start_ip]        ----------------------------
 *   [2]               if (cpu != TLS->rseq::cpu_id)
 *                             goto abort_ip;
 *   [3]               <last_instruction_in_cs>
 *   [post_commit_ip]  ----------------------------
 *
 *   The address of jump target abort_ip must be outside the critical
 *   region, i.e.:
 *
 *     [abort_ip] < [start_ip]  || [abort_ip] >= [post_commit_ip]
 *
 *   Steps [2]-[3] (inclusive) need to be a sequence of instructions in
 *   userspace that can handle being interrupted between any of those
 *   instructions, and then resumed to the abort_ip.
 *
 *   1.  Userspace stores the address of the struct rseq_cs assembly
 *       block descriptor into the rseq_cs field of the registered
 *       struct rseq TLS area. This update is performed through a single
 *       store within the inline assembly instruction sequence.
 *       [start_ip]
 *
 *   2.  Userspace checks whether the current cpu_id field matches the
 *       cpu number loaded before start_ip, branching to abort_ip in
 *       case of a mismatch.
 *
 *       If the sequence is preempted or interrupted by a signal
 *       at or after start_ip and before post_commit_ip, then the kernel
 *       clears TLS->__rseq_abi::rseq_cs, and sets the user-space return
 *       ip to abort_ip before returning to user-space, so the preempted
 *       execution resumes at abort_ip.
 *
 *   3.  The final instruction of the userspace critical section, right
 *       before post_commit_ip, is the commit. The critical section is
 *       self-terminating.
 *       [post_commit_ip]
 *
 *   4.  <success>
 *
 *   On failure at [2], or if interrupted by preempt or signal delivery
 *   between [1] and [3]:
 *
 *       [abort_ip]
 *   F1. <failure>
 */
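/*
 * Illustrative sketch (not part of the kernel build): a minimal user-space
 * per-CPU counter increment following the algorithm above, written in C
 * pseudo-form rather than the contiguous inline assembly a real
 * implementation needs. The __rseq_abi TLS symbol and the surrounding
 * function are assumptions for illustration; real users should rely on the
 * rseq(2) uapi headers and librseq.
 *
 *	struct rseq_cs cs = {
 *		.start_ip		= (uintptr_t)&&start,
 *		.post_commit_offset	= (uintptr_t)&&post_commit - (uintptr_t)&&start,
 *		.abort_ip		= (uintptr_t)&&abort,
 *	};
 *	int cpu = __rseq_abi.cpu_id_start;		// load cpu before [start_ip]
 *	__rseq_abi.rseq_cs = (uintptr_t)&cs;		// [1] arm the critical section
 * start:
 *	if (cpu != __rseq_abi.cpu_id)			// [2] still on the same CPU?
 *		goto abort;
 *	counters[cpu]++;				// [3] commit via a single store
 * post_commit:
 *	return 0;
 * abort:
 *	return -1;					// F1: caller retries or falls back
 *
 * A real critical section must be emitted as one assembly block so that the
 * start/post_commit/abort addresses are well defined and the abort target is
 * preceded by the registered signature; the C form above only shows the
 * control flow.
 */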

/* Required to select the proper per_cpu ops for rseq_stats_inc() */
#define RSEQ_BUILD_SLOW_PATH

#include <linux/debugfs.h>
#include <linux/prctl.h>
#include <linux/ratelimit.h>
#include <linux/rseq_entry.h>
#include <linux/sched.h>
#include <linux/syscalls.h>
#include <linux/uaccess.h>
#include <linux/types.h>
#include <asm/ptrace.h>

#define CREATE_TRACE_POINTS
#include <trace/events/rseq.h>

DEFINE_STATIC_KEY_MAYBE(CONFIG_RSEQ_DEBUG_DEFAULT_ENABLE, rseq_debug_enabled);

static inline void rseq_control_debug(bool on)
{
	if (on)
		static_branch_enable(&rseq_debug_enabled);
	else
		static_branch_disable(&rseq_debug_enabled);
}

static int __init rseq_setup_debug(char *str)
{
	bool on;

	if (kstrtobool(str, &on))
		return -EINVAL;
	rseq_control_debug(on);
	return 1;
}
__setup("rseq_debug=", rseq_setup_debug);
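/*
 * Example (standard boot parameter syntax, values parsed by kstrtobool()):
 * booting with "rseq_debug=1" on the kernel command line enables the debug
 * checks at boot, "rseq_debug=0" keeps them disabled. The same switch is
 * also reachable at runtime through debugfs, see below.
 */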

#ifdef CONFIG_TRACEPOINTS
/*
 * Out of line, so the actual update functions can be in a header to be
 * inlined into the exit to user code.
 */
void __rseq_trace_update(struct task_struct *t)
{
	trace_rseq_update(t);
}

void __rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip,
			   unsigned long offset, unsigned long abort_ip)
{
	trace_rseq_ip_fixup(ip, start_ip, offset, abort_ip);
}
#endif /* CONFIG_TRACEPOINTS */

#ifdef CONFIG_DEBUG_FS
#ifdef CONFIG_RSEQ_STATS
DEFINE_PER_CPU(struct rseq_stats, rseq_stats);

static int rseq_stats_show(struct seq_file *m, void *p)
{
	struct rseq_stats stats = { };
	unsigned int cpu;

	for_each_possible_cpu(cpu) {
		stats.exit	+= data_race(per_cpu(rseq_stats.exit, cpu));
		stats.signal	+= data_race(per_cpu(rseq_stats.signal, cpu));
		stats.slowpath	+= data_race(per_cpu(rseq_stats.slowpath, cpu));
		stats.fastpath	+= data_race(per_cpu(rseq_stats.fastpath, cpu));
		stats.ids	+= data_race(per_cpu(rseq_stats.ids, cpu));
		stats.cs	+= data_race(per_cpu(rseq_stats.cs, cpu));
		stats.clear	+= data_race(per_cpu(rseq_stats.clear, cpu));
		stats.fixup	+= data_race(per_cpu(rseq_stats.fixup, cpu));
		if (IS_ENABLED(CONFIG_RSEQ_SLICE_EXTENSION)) {
			stats.s_granted	+= data_race(per_cpu(rseq_stats.s_granted, cpu));
			stats.s_expired	+= data_race(per_cpu(rseq_stats.s_expired, cpu));
			stats.s_revoked	+= data_race(per_cpu(rseq_stats.s_revoked, cpu));
			stats.s_yielded	+= data_race(per_cpu(rseq_stats.s_yielded, cpu));
			stats.s_aborted	+= data_race(per_cpu(rseq_stats.s_aborted, cpu));
		}
	}

	seq_printf(m, "exit:   %16lu\n", stats.exit);
	seq_printf(m, "signal: %16lu\n", stats.signal);
	seq_printf(m, "slowp:  %16lu\n", stats.slowpath);
	seq_printf(m, "fastp:  %16lu\n", stats.fastpath);
	seq_printf(m, "ids:    %16lu\n", stats.ids);
	seq_printf(m, "cs:     %16lu\n", stats.cs);
	seq_printf(m, "clear:  %16lu\n", stats.clear);
	seq_printf(m, "fixup:  %16lu\n", stats.fixup);
	if (IS_ENABLED(CONFIG_RSEQ_SLICE_EXTENSION)) {
		seq_printf(m, "sgrant: %16lu\n", stats.s_granted);
		seq_printf(m, "sexpir: %16lu\n", stats.s_expired);
		seq_printf(m, "srevok: %16lu\n", stats.s_revoked);
		seq_printf(m, "syield: %16lu\n", stats.s_yielded);
		seq_printf(m, "sabort: %16lu\n", stats.s_aborted);
	}
	return 0;
}

static int rseq_stats_open(struct inode *inode, struct file *file)
{
	return single_open(file, rseq_stats_show, inode->i_private);
}

static const struct file_operations stat_ops = {
	.open		= rseq_stats_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};

static int __init rseq_stats_init(struct dentry *root_dir)
{
	debugfs_create_file("stats", 0444, root_dir, NULL, &stat_ops);
	return 0;
}
#else
static inline void rseq_stats_init(struct dentry *root_dir) { }
#endif /* CONFIG_RSEQ_STATS */

static int rseq_debug_show(struct seq_file *m, void *p)
{
	bool on = static_branch_unlikely(&rseq_debug_enabled);

	seq_printf(m, "%d\n", on);
	return 0;
}

static ssize_t rseq_debug_write(struct file *file, const char __user *ubuf,
			    size_t count, loff_t *ppos)
{
	bool on;

	if (kstrtobool_from_user(ubuf, count, &on))
		return -EINVAL;

	rseq_control_debug(on);
	return count;
}

static int rseq_debug_open(struct inode *inode, struct file *file)
{
	return single_open(file, rseq_debug_show, inode->i_private);
}

static const struct file_operations debug_ops = {
	.open		= rseq_debug_open,
	.read		= seq_read,
	.write		= rseq_debug_write,
	.llseek		= seq_lseek,
	.release	= single_release,
};

static int __init rseq_debugfs_init(void)
{
	struct dentry *root_dir = debugfs_create_dir("rseq", NULL);

	debugfs_create_file("debug", 0644, root_dir, NULL, &debug_ops);
	rseq_stats_init(root_dir);
	return 0;
}
__initcall(rseq_debugfs_init);
#endif /* CONFIG_DEBUG_FS */
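/*
 * Usage sketch (assuming debugfs is mounted at /sys/kernel/debug):
 *
 *	# cat /sys/kernel/debug/rseq/stats		# dump the summed per-CPU counters
 *	# echo 1 > /sys/kernel/debug/rseq/debug		# enable debug checks at runtime
 *	# cat /sys/kernel/debug/rseq/debug		# read back the current setting
 *
 * The stats file is only created with CONFIG_RSEQ_STATS=y, and the slice
 * extension counters are only printed with CONFIG_RSEQ_SLICE_EXTENSION=y.
 */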

static bool rseq_set_ids(struct task_struct *t, struct rseq_ids *ids, u32 node_id)
{
	return rseq_set_ids_get_csaddr(t, ids, node_id, NULL);
}

static bool rseq_handle_cs(struct task_struct *t, struct pt_regs *regs)
{
	struct rseq __user *urseq = t->rseq.usrptr;
	u64 csaddr;

	scoped_user_read_access(urseq, efault)
		unsafe_get_user(csaddr, &urseq->rseq_cs, efault);
	if (likely(!csaddr))
		return true;
	return rseq_update_user_cs(t, regs, csaddr);
efault:
	return false;
}

static void rseq_slowpath_update_usr(struct pt_regs *regs)
{
	/*
	 * Preserve the rseq state and the user_irq state. The generic entry
	 * code clears user_irq on the way out; architectures without generic
	 * entry do not have user_irq at all.
	 */
	const struct rseq_event evt_mask = { .has_rseq = true, .user_irq = true, };
	struct task_struct *t = current;
	struct rseq_ids ids;
	u32 node_id;
	bool event;

	if (unlikely(t->flags & PF_EXITING))
		return;

	rseq_stat_inc(rseq_stats.slowpath);

	/*
	 * Read and clear the event pending bit first. If the task was not
	 * preempted or migrated and no signal is on the way, there is no
	 * point in doing any of the heavy lifting here on production
	 * kernels. In that case TIF_NOTIFY_RESUME was raised by some other
	 * functionality.
	 *
	 * This is correct because the read/clear operation is
	 * guarded against scheduler preemption, which makes it CPU
	 * local atomic. If the task is preempted right after
	 * re-enabling preemption then TIF_NOTIFY_RESUME is set
	 * again and this function is invoked another time _before_
	 * the task is able to return to user mode.
	 *
	 * On a debug kernel, invoke the fixup code unconditionally
	 * with the result handed in to allow the detection of
	 * inconsistencies.
	 */
	scoped_guard(irq) {
		event = t->rseq.event.sched_switch;
		t->rseq.event.all &= evt_mask.all;
		ids.cpu_id = task_cpu(t);
		ids.mm_cid = task_mm_cid(t);
	}

	if (!event)
		return;

	node_id = cpu_to_node(ids.cpu_id);

	if (unlikely(!rseq_update_usr(t, regs, &ids, node_id))) {
		/*
		 * Clear the errors just in case this might survive magically, but
		 * leave the rest intact.
		 */
		t->rseq.event.error = 0;
		force_sig(SIGSEGV);
	}
}

void __rseq_handle_slowpath(struct pt_regs *regs)
{
	/*
	 * If invoked from hypervisors before entering the guest via
	 * resume_user_mode_work(), then @regs is a NULL pointer.
	 *
	 * resume_user_mode_work() clears TIF_NOTIFY_RESUME and re-raises
	 * it before returning from the ioctl() to user space when
	 * rseq_event.sched_switch is set.
	 *
	 * So it's safe to ignore here instead of pointlessly updating it
	 * in the vcpu_run() loop.
	 */
	if (!regs)
		return;

	rseq_slowpath_update_usr(regs);
}

void __rseq_signal_deliver(int sig, struct pt_regs *regs)
{
	rseq_stat_inc(rseq_stats.signal);
	/*
	 * Don't update IDs, they are handled on exit to user if
	 * necessary. The important thing is to abort a critical section of
	 * the interrupted context as after this point the instruction
	 * pointer in @regs points to the signal handler.
	 */
	if (unlikely(!rseq_handle_cs(current, regs))) {
		/*
		 * Clear the errors just in case this might survive
		 * magically, but leave the rest intact.
		 */
		current->rseq.event.error = 0;
		force_sigsegv(sig);
	}
}

/*
 * Terminate the process if a syscall is issued within a restartable
 * sequence.
 */
void __rseq_debug_syscall_return(struct pt_regs *regs)
{
	struct task_struct *t = current;
	u64 csaddr;

	if (!t->rseq.event.has_rseq)
		return;
	if (get_user(csaddr, &t->rseq.usrptr->rseq_cs))
		goto fail;
	if (likely(!csaddr))
		return;
	if (unlikely(csaddr >= TASK_SIZE))
		goto fail;
	if (rseq_debug_update_user_cs(t, regs, csaddr))
		return;
fail:
	force_sig(SIGSEGV);
}

#ifdef CONFIG_DEBUG_RSEQ
/* Kept around to support GENERIC_ENTRY=n architectures. */
void rseq_syscall(struct pt_regs *regs)
{
	__rseq_debug_syscall_return(regs);
}
#endif

static bool rseq_reset_ids(void)
{
	struct rseq_ids ids = {
		.cpu_id		= RSEQ_CPU_ID_UNINITIALIZED,
		.mm_cid		= 0,
	};

	/*
	 * If this fails, terminate the task, because failure leaves the
	 * kernel in a nonsensical state: the exit to user space path would
	 * try to fix up the IDs again.
	 */
	if (rseq_set_ids(current, &ids, 0))
		return true;

	force_sig(SIGSEGV);
	return false;
}

/* The original rseq structure size (including padding) is 32 bytes. */
#define ORIG_RSEQ_SIZE		32

/*
 * sys_rseq - set up restartable sequences for the caller thread.
 */
SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32, sig)
{
	u32 rseqfl = 0;

	if (flags & RSEQ_FLAG_UNREGISTER) {
		if (flags & ~RSEQ_FLAG_UNREGISTER)
			return -EINVAL;
		/* Unregister rseq for current thread. */
		if (current->rseq.usrptr != rseq || !current->rseq.usrptr)
			return -EINVAL;
		if (rseq_len != current->rseq.len)
			return -EINVAL;
		if (current->rseq.sig != sig)
			return -EPERM;
		if (!rseq_reset_ids())
			return -EFAULT;
		rseq_reset(current);
		return 0;
	}

	if (unlikely(flags))
		return -EINVAL;

	if (current->rseq.usrptr) {
		/*
		 * If rseq is already registered, check whether
		 * the provided address differs from the prior
		 * one.
		 */
		if (current->rseq.usrptr != rseq || rseq_len != current->rseq.len)
			return -EINVAL;
		if (current->rseq.sig != sig)
			return -EPERM;
		/* Already registered. */
		return -EBUSY;
	}

	/*
	 * If there was no rseq previously registered, ensure the provided rseq
	 * is properly aligned, as communicated to user-space through the ELF
	 * auxiliary vector AT_RSEQ_ALIGN. If rseq_len is the original rseq
	 * size, the required alignment is the original struct rseq alignment.
	 *
	 * In order to be valid, rseq_len is either the original rseq size, or
	 * large enough to contain all supported fields, as communicated to
	 * user-space through the ELF auxiliary vector AT_RSEQ_FEATURE_SIZE.
	 */
	if (rseq_len < ORIG_RSEQ_SIZE ||
	    (rseq_len == ORIG_RSEQ_SIZE && !IS_ALIGNED((unsigned long)rseq, ORIG_RSEQ_SIZE)) ||
	    (rseq_len != ORIG_RSEQ_SIZE && (!IS_ALIGNED((unsigned long)rseq, __alignof__(*rseq)) ||
					    rseq_len < offsetof(struct rseq, end))))
		return -EINVAL;
	if (!access_ok(rseq, rseq_len))
		return -EFAULT;

	if (IS_ENABLED(CONFIG_RSEQ_SLICE_EXTENSION))
		rseqfl |= RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE;

	scoped_user_write_access(rseq, efault) {
		/*
		 * If the rseq_cs pointer is non-NULL on registration, clear it to
		 * avoid a potential segfault on return to user-space. The proper thing
		 * to do would have been to fail the registration but this would break
		 * older libcs that reuse the rseq area for new threads without
		 * clearing the fields. Don't bother reading it, just reset it.
		 */
		unsafe_put_user(0UL, &rseq->rseq_cs, efault);
		unsafe_put_user(rseqfl, &rseq->flags, efault);
		/* Initialize IDs in user space */
		unsafe_put_user(RSEQ_CPU_ID_UNINITIALIZED, &rseq->cpu_id_start, efault);
		unsafe_put_user(RSEQ_CPU_ID_UNINITIALIZED, &rseq->cpu_id, efault);
		unsafe_put_user(0U, &rseq->node_id, efault);
		unsafe_put_user(0U, &rseq->mm_cid, efault);
		unsafe_put_user(0U, &rseq->slice_ctrl.all, efault);
	}

	/*
	 * Activate the registration by setting the rseq area address, length
	 * and signature in the task struct.
	 */
	current->rseq.usrptr = rseq;
	current->rseq.len = rseq_len;
	current->rseq.sig = sig;

	/*
	 * If rseq was previously inactive, and has just been
	 * registered, ensure the cpu_id_start and cpu_id fields
	 * are updated before returning to user-space.
	 */
	current->rseq.event.has_rseq = true;
	rseq_force_update();
	return 0;

efault:
	return -EFAULT;
}
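/*
 * Registration sketch from the user-space side (normally done by libc; the
 * explicit syscall is only needed when no rseq-aware libc is in use). This
 * is an illustrative, assumption-laden example, not kernel code: RSEQ_SIG is
 * the application-chosen signature, the 32-byte length matches
 * ORIG_RSEQ_SIZE above, and newer feature sizes come from the
 * AT_RSEQ_FEATURE_SIZE auxval.
 *
 *	static __thread struct rseq rs __attribute__((aligned(32)));
 *
 *	if (syscall(__NR_rseq, &rs, 32, 0, RSEQ_SIG))
 *		perror("rseq registration");
 *	...
 *	if (syscall(__NR_rseq, &rs, 32, RSEQ_FLAG_UNREGISTER, RSEQ_SIG))
 *		perror("rseq unregistration");
 *
 * Registering again with the same address, length and signature returns
 * -EBUSY; mismatches return -EINVAL or -EPERM as implemented above.
 */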

#ifdef CONFIG_RSEQ_SLICE_EXTENSION
DEFINE_STATIC_KEY_TRUE(rseq_slice_extension_key);

int rseq_slice_extension_prctl(unsigned long arg2, unsigned long arg3)
{
	switch (arg2) {
	case PR_RSEQ_SLICE_EXTENSION_GET:
		if (arg3)
			return -EINVAL;
		return current->rseq.slice.state.enabled ? PR_RSEQ_SLICE_EXT_ENABLE : 0;

	case PR_RSEQ_SLICE_EXTENSION_SET: {
		u32 rflags, valid = RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE;
		bool enable = !!(arg3 & PR_RSEQ_SLICE_EXT_ENABLE);

		if (arg3 & ~PR_RSEQ_SLICE_EXT_ENABLE)
			return -EINVAL;
		if (!rseq_slice_extension_enabled())
			return -ENOTSUPP;
		if (!current->rseq.usrptr)
			return -ENXIO;

		/* No change? */
		if (enable == !!current->rseq.slice.state.enabled)
			return 0;

		if (get_user(rflags, &current->rseq.usrptr->flags))
			goto die;

		if (current->rseq.slice.state.enabled)
			valid |= RSEQ_CS_FLAG_SLICE_EXT_ENABLED;

		if ((rflags & valid) != valid)
			goto die;

		rflags &= ~RSEQ_CS_FLAG_SLICE_EXT_ENABLED;
		rflags |= RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE;
		if (enable)
			rflags |= RSEQ_CS_FLAG_SLICE_EXT_ENABLED;

		if (put_user(rflags, &current->rseq.usrptr->flags))
			goto die;

		current->rseq.slice.state.enabled = enable;
		return 0;
	}
	default:
		return -EINVAL;
	}
die:
	force_sig(SIGSEGV);
	return -EFAULT;
}

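/*
 * Control sketch from user space. Only the arg2/arg3 semantics are defined
 * by the function above; the outer prctl() option that routes here is
 * assumed to be exposed as PR_RSEQ_SLICE_EXTENSION for illustration:
 *
 *	// query: returns PR_RSEQ_SLICE_EXT_ENABLE or 0
 *	prctl(PR_RSEQ_SLICE_EXTENSION, PR_RSEQ_SLICE_EXTENSION_GET, 0, 0, 0);
 *
 *	// enable: requires a registered rseq area whose flags carry the
 *	// kernel-set RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE bit
 *	prctl(PR_RSEQ_SLICE_EXTENSION, PR_RSEQ_SLICE_EXTENSION_SET,
 *	      PR_RSEQ_SLICE_EXT_ENABLE, 0, 0);
 *
 * Tampering with the rseq->flags bits from user space is caught by the
 * 'valid' check above and terminates the task with SIGSEGV.
 */
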
/**
 * sys_rseq_slice_yield - yield the current CPU without side effects when a
 *			  task that was granted a time slice extension finishes
 *			  its critical work before being forced out.
 *
 * Return: 1 if the task successfully yielded the CPU within the granted slice.
 *         0 if the slice extension was never granted, or was revoked because
 *	     the task overran the granted extension, used a syscall other than
 *	     this one, or was scheduled out earlier due to a subsequent
 *	     interrupt.
 *
 * The syscall itself does not schedule because the syscall entry work
 * immediately relinquishes the CPU and schedules if required.
 */
SYSCALL_DEFINE0(rseq_slice_yield)
{
	int yielded = !!current->rseq.slice.yielded;

	current->rseq.slice.yielded = 0;
	return yielded;
}
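/*
 * User-space sketch: after finishing the critical work within a granted
 * extension, the task hands the CPU back via this syscall (assuming the
 * syscall number is exported as __NR_rseq_slice_yield once wired up):
 *
 *	long granted = syscall(__NR_rseq_slice_yield);
 *	// granted == 1: yielded within the extension window
 *	// granted == 0: no extension was granted or it was already revoked
 */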

static int __init rseq_slice_cmdline(char *str)
{
	bool on;

	if (kstrtobool(str, &on))
		return 0;

	if (!on)
		static_branch_disable(&rseq_slice_extension_key);
	return 1;
}
__setup("rseq_slice_ext=", rseq_slice_cmdline);
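/*
 * Example (same boot parameter syntax as rseq_debug=): "rseq_slice_ext=0"
 * on the kernel command line disables time slice extensions; they default
 * to enabled when CONFIG_RSEQ_SLICE_EXTENSION=y.
 */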
#endif /* CONFIG_RSEQ_SLICE_EXTENSION */
590