// SPDX-License-Identifier: GPL-2.0+
/*
 * Restartable sequences system call
 *
 * Copyright (C) 2015, Google, Inc.,
 * Paul Turner <pjt@google.com> and Andrew Hunter <ahh@google.com>
 * Copyright (C) 2015-2018, EfficiOS Inc.,
 * Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
 */

/*
 * Restartable sequences are a lightweight interface that allows
 * user-level code to be executed atomically relative to scheduler
 * preemption and signal delivery. Typically used for implementing
 * per-cpu operations.
 *
 * It allows user-space to perform update operations on per-cpu data
 * without requiring heavy-weight atomic operations.
 *
 * Detailed algorithm of rseq user-space assembly sequences:
 *
 *                     init(rseq_cs)
 *                     cpu = TLS->rseq::cpu_id_start
 *   [1]               TLS->rseq::rseq_cs = rseq_cs
 *   [start_ip]        ----------------------------
 *   [2]               if (cpu != TLS->rseq::cpu_id)
 *                             goto abort_ip;
 *   [3]               <last_instruction_in_cs>
 *   [post_commit_ip]  ----------------------------
 *
 * The address of jump target abort_ip must be outside the critical
 * region, i.e.:
 *
 *     [abort_ip] < [start_ip] || [abort_ip] >= [post_commit_ip]
 *
 * Steps [2]-[3] (inclusive) need to be a sequence of instructions in
 * userspace that can handle being interrupted between any of those
 * instructions, and then resumed to the abort_ip.
 *
 * 1.  Userspace stores the address of the struct rseq_cs assembly
 *     block descriptor into the rseq_cs field of the registered
 *     struct rseq TLS area. This update is performed through a single
 *     store within the inline assembly instruction sequence.
 *     [start_ip]
 *
 * 2.  Userspace tests to check whether the current cpu_id field matches
 *     the cpu number loaded before start_ip, branching to abort_ip
 *     in case of a mismatch.
 *
 *     If the sequence is preempted or interrupted by a signal
 *     at or after start_ip and before post_commit_ip, then the kernel
 *     clears TLS->__rseq_abi::rseq_cs, and sets the user-space return
 *     ip to abort_ip before returning to user-space, so the preempted
 *     execution resumes at abort_ip.
 *
 * 3.  Userspace critical section final instruction before
 *     post_commit_ip is the commit. The critical section is
 *     self-terminating.
 *     [post_commit_ip]
 *
 * 4.  <success>
 *
 * On failure at [2], or if interrupted by preempt or signal delivery
 * between [1] and [3]:
 *
 *     [abort_ip]
 * F1. <failure>
 */
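
/*
 * Illustrative sketch (not part of this file): how user space typically
 * strings the above steps together for a per-CPU counter increment. The
 * names __rseq_abi, RSEQ_READ_ONCE(), RSEQ_WRITE_ONCE() and counters[]
 * are assumed user-space symbols. A real implementation (e.g. librseq)
 * must emit steps [1]-[3] as one contiguous inline assembly block so that
 * the start_ip/post_commit_offset/abort_ip fields of struct rseq_cs really
 * cover those machine instructions; the pseudo-C below only shows the
 * intent and the fields involved:
 *
 *	static struct rseq_cs cs = {
 *		.start_ip		= (u64)start_label,
 *		.post_commit_offset	= (u64)(post_commit_label - start_label),
 *		.abort_ip		= (u64)abort_label,
 *	};
 *
 *	u32 cpu = RSEQ_READ_ONCE(__rseq_abi.cpu_id_start);
 *	RSEQ_WRITE_ONCE(__rseq_abi.rseq_cs, (u64)&cs);		// [1]
 * start_label:
 *	if (cpu != RSEQ_READ_ONCE(__rseq_abi.cpu_id))		// [2]
 *		goto abort_label;
 *	counters[cpu]++;					// [3] commit
 * post_commit_label:
 *	return 0;
 * abort_label:
 *	return -1;	// retry, or fall back to an atomic operation
 */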

/* Required to select the proper per_cpu ops for rseq_stats_inc() */
#define RSEQ_BUILD_SLOW_PATH

#include <linux/debugfs.h>
#include <linux/hrtimer.h>
#include <linux/percpu.h>
#include <linux/prctl.h>
#include <linux/ratelimit.h>
#include <linux/rseq_entry.h>
#include <linux/sched.h>
#include <linux/syscalls.h>
#include <linux/uaccess.h>
#include <linux/types.h>
#include <linux/rseq.h>
#include <asm/ptrace.h>

#define CREATE_TRACE_POINTS
#include <trace/events/rseq.h>

DEFINE_STATIC_KEY_MAYBE(CONFIG_RSEQ_DEBUG_DEFAULT_ENABLE, rseq_debug_enabled);

static inline void rseq_control_debug(bool on)
{
	if (on)
		static_branch_enable(&rseq_debug_enabled);
	else
		static_branch_disable(&rseq_debug_enabled);
}

static int __init rseq_setup_debug(char *str)
{
	bool on;

	if (kstrtobool(str, &on))
		return -EINVAL;
	rseq_control_debug(on);
	return 1;
}
__setup("rseq_debug=", rseq_setup_debug);
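
/*
 * Usage note (illustrative): the debug checks can be selected at boot time
 * with "rseq_debug=on" (or "off") on the kernel command line, and toggled
 * at run time through the debugfs file created by rseq_debugfs_init()
 * below, assuming debugfs is mounted at the conventional location:
 *
 *	# echo 1 > /sys/kernel/debug/rseq/debug
 *	# cat /sys/kernel/debug/rseq/debug
 *	1
 */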

#ifdef CONFIG_TRACEPOINTS
/*
 * Out of line, so the actual update functions can be in a header to be
 * inlined into the exit to user code.
 */
void __rseq_trace_update(struct task_struct *t)
{
	trace_rseq_update(t);
}

void __rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip,
			   unsigned long offset, unsigned long abort_ip)
{
	trace_rseq_ip_fixup(ip, start_ip, offset, abort_ip);
}
#endif /* CONFIG_TRACEPOINTS */

#ifdef CONFIG_RSEQ_STATS
DEFINE_PER_CPU(struct rseq_stats, rseq_stats);

static int rseq_stats_show(struct seq_file *m, void *p)
{
	struct rseq_stats stats = { };
	unsigned int cpu;

	for_each_possible_cpu(cpu) {
		stats.exit += data_race(per_cpu(rseq_stats.exit, cpu));
		stats.signal += data_race(per_cpu(rseq_stats.signal, cpu));
		stats.slowpath += data_race(per_cpu(rseq_stats.slowpath, cpu));
		stats.fastpath += data_race(per_cpu(rseq_stats.fastpath, cpu));
		stats.ids += data_race(per_cpu(rseq_stats.ids, cpu));
		stats.cs += data_race(per_cpu(rseq_stats.cs, cpu));
		stats.clear += data_race(per_cpu(rseq_stats.clear, cpu));
		stats.fixup += data_race(per_cpu(rseq_stats.fixup, cpu));
		if (IS_ENABLED(CONFIG_RSEQ_SLICE_EXTENSION)) {
			stats.s_granted += data_race(per_cpu(rseq_stats.s_granted, cpu));
			stats.s_expired += data_race(per_cpu(rseq_stats.s_expired, cpu));
			stats.s_revoked += data_race(per_cpu(rseq_stats.s_revoked, cpu));
			stats.s_yielded += data_race(per_cpu(rseq_stats.s_yielded, cpu));
			stats.s_aborted += data_race(per_cpu(rseq_stats.s_aborted, cpu));
		}
	}

	seq_printf(m, "exit: %16lu\n", stats.exit);
	seq_printf(m, "signal: %16lu\n", stats.signal);
	seq_printf(m, "slowp: %16lu\n", stats.slowpath);
	seq_printf(m, "fastp: %16lu\n", stats.fastpath);
	seq_printf(m, "ids: %16lu\n", stats.ids);
	seq_printf(m, "cs: %16lu\n", stats.cs);
	seq_printf(m, "clear: %16lu\n", stats.clear);
	seq_printf(m, "fixup: %16lu\n", stats.fixup);
	if (IS_ENABLED(CONFIG_RSEQ_SLICE_EXTENSION)) {
		seq_printf(m, "sgrant: %16lu\n", stats.s_granted);
		seq_printf(m, "sexpir: %16lu\n", stats.s_expired);
		seq_printf(m, "srevok: %16lu\n", stats.s_revoked);
		seq_printf(m, "syield: %16lu\n", stats.s_yielded);
		seq_printf(m, "sabort: %16lu\n", stats.s_aborted);
	}
	return 0;
}

static int rseq_stats_open(struct inode *inode, struct file *file)
{
	return single_open(file, rseq_stats_show, inode->i_private);
}

static const struct file_operations stat_ops = {
	.open = rseq_stats_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = single_release,
};

static int __init rseq_stats_init(struct dentry *root_dir)
{
	debugfs_create_file("stats", 0444, root_dir, NULL, &stat_ops);
	return 0;
}
#else
static inline void rseq_stats_init(struct dentry *root_dir) { }
#endif /* CONFIG_RSEQ_STATS */

static int rseq_debug_show(struct seq_file *m, void *p)
{
	bool on = static_branch_unlikely(&rseq_debug_enabled);

	seq_printf(m, "%d\n", on);
	return 0;
}

static ssize_t rseq_debug_write(struct file *file, const char __user *ubuf,
				size_t count, loff_t *ppos)
{
	bool on;

	if (kstrtobool_from_user(ubuf, count, &on))
		return -EINVAL;

	rseq_control_debug(on);
	return count;
}

static int rseq_debug_open(struct inode *inode, struct file *file)
{
	return single_open(file, rseq_debug_show, inode->i_private);
}

static const struct file_operations debug_ops = {
	.open = rseq_debug_open,
	.read = seq_read,
	.write = rseq_debug_write,
	.llseek = seq_lseek,
	.release = single_release,
};

static void rseq_slice_ext_init(struct dentry *root_dir);

static int __init rseq_debugfs_init(void)
{
	struct dentry *root_dir = debugfs_create_dir("rseq", NULL);

	debugfs_create_file("debug", 0644, root_dir, NULL, &debug_ops);
	rseq_stats_init(root_dir);
	if (IS_ENABLED(CONFIG_RSEQ_SLICE_EXTENSION))
		rseq_slice_ext_init(root_dir);
	return 0;
}
__initcall(rseq_debugfs_init);

static bool rseq_handle_cs(struct task_struct *t, struct pt_regs *regs)
{
	struct rseq __user *urseq = t->rseq.usrptr;
	u64 csaddr;

	scoped_user_read_access(urseq, efault)
		unsafe_get_user(csaddr, &urseq->rseq_cs, efault);
	if (likely(!csaddr))
		return true;
	return rseq_update_user_cs(t, regs, csaddr);
efault:
	return false;
}

static void rseq_slowpath_update_usr(struct pt_regs *regs)
{
	/*
	 * Preserve the has_rseq and user_irq state. The generic entry code
	 * clears user_irq on the way out to user space; architectures which
	 * do not use the generic entry code never set user_irq.
	 */
	const struct rseq_event evt_mask = {
		.has_rseq = RSEQ_HAS_RSEQ_VERSION_MASK,
		.user_irq = true,
	};
	struct task_struct *t = current;
	struct rseq_ids ids;
	bool event;

	if (unlikely(t->flags & PF_EXITING))
		return;

	rseq_stat_inc(rseq_stats.slowpath);

	/*
	 * Read and clear the event pending bit first. If the task was
	 * neither preempted nor migrated and no signal is on the way,
	 * there is no point in doing any of the heavy lifting here on
	 * production kernels. In that case TIF_NOTIFY_RESUME was raised
	 * by some other functionality.
	 *
	 * This is correct because the read/clear operation is guarded
	 * against scheduler preemption, which makes it CPU local atomic.
	 * If the task is preempted right after re-enabling preemption
	 * then TIF_NOTIFY_RESUME is set again and this function is
	 * invoked another time _before_ the task is able to return to
	 * user mode.
	 *
	 * On a debug kernel, invoke the fixup code unconditionally with
	 * the result handed in to allow the detection of inconsistencies.
	 */
	scoped_guard(irq) {
		event = t->rseq.event.sched_switch;
		t->rseq.event.all &= evt_mask.all;
		ids.cpu_id = task_cpu(t);
		ids.mm_cid = task_mm_cid(t);
	}

	if (!event)
		return;

	ids.node_id = cpu_to_node(ids.cpu_id);

	if (unlikely(!rseq_update_usr(t, regs, &ids))) {
		/*
		 * Clear the errors just in case this might survive
		 * magically, but leave the rest intact.
		 */
		t->rseq.event.error = 0;
		force_sig(SIGSEGV);
	}
}

void __rseq_handle_slowpath(struct pt_regs *regs)
{
	/*
	 * If invoked from hypervisors before entering the guest via
	 * resume_user_mode_work(), then @regs is a NULL pointer.
	 *
	 * resume_user_mode_work() clears TIF_NOTIFY_RESUME and re-raises
	 * it before returning from the ioctl() to user space when
	 * rseq_event.sched_switch is set.
	 *
	 * So it's safe to ignore here instead of pointlessly updating it
	 * in the vcpu_run() loop.
	 */
	if (!regs)
		return;

	rseq_slowpath_update_usr(regs);
}

void __rseq_signal_deliver(int sig, struct pt_regs *regs)
{
	rseq_stat_inc(rseq_stats.signal);

	/*
	 * Don't update IDs yet, they are handled on exit to user if
	 * necessary. The important thing is to abort a critical section of
	 * the interrupted context as after this point the instruction
	 * pointer in @regs points to the signal handler.
	 */
	if (unlikely(!rseq_handle_cs(current, regs))) {
		/*
		 * Clear the errors just in case this might survive
		 * magically, but leave the rest intact.
		 */
		current->rseq.event.error = 0;
		force_sigsegv(sig);
	}

	/*
	 * In legacy mode, force the update of IDs before returning to user
	 * space to stay compatible.
	 */
	if (!rseq_v2(current))
		rseq_force_update();
}

/*
 * Terminate the process if a syscall is issued within a restartable
 * sequence.
 */
void __rseq_debug_syscall_return(struct pt_regs *regs)
{
	struct task_struct *t = current;
	u64 csaddr;

	if (!t->rseq.event.has_rseq)
		return;
	if (get_user(csaddr, &t->rseq.usrptr->rseq_cs))
		goto fail;
	if (likely(!csaddr))
		return;
	if (unlikely(csaddr >= TASK_SIZE))
		goto fail;
	if (rseq_debug_update_user_cs(t, regs, csaddr))
		return;
fail:
	force_sig(SIGSEGV);
}

#ifdef CONFIG_DEBUG_RSEQ
/* Kept around to keep GENERIC_ENTRY=n architectures supported. */
void rseq_syscall(struct pt_regs *regs)
{
	__rseq_debug_syscall_return(regs);
}
#endif

static bool rseq_reset_ids(void)
{
	struct rseq __user *rseq = current->rseq.usrptr;

	/*
	 * If this fails, terminate the task because otherwise the kernel
	 * is left in an inconsistent state: the exit to user space would
	 * try to fix up the IDs again.
	 */
	scoped_user_rw_access(rseq, efault) {
		unsafe_put_user(0, &rseq->cpu_id_start, efault);
		unsafe_put_user(RSEQ_CPU_ID_UNINITIALIZED, &rseq->cpu_id, efault);
		unsafe_put_user(0, &rseq->node_id, efault);
		unsafe_put_user(0, &rseq->mm_cid, efault);
	}
	return true;

efault:
	force_sig(SIGSEGV);
	return false;
}

/* The original rseq structure size (including padding) is 32 bytes. */
#define ORIG_RSEQ_SIZE 32

static long rseq_register(struct rseq __user *rseq, u32 rseq_len, int flags, u32 sig)
{
	u32 rseqfl = 0;
	u8 version = 1;

	if (!access_ok(rseq, rseq_len))
		return -EFAULT;

	/*
	 * Architectures which use the generic IRQ entry code enable
	 * registrations with a size greater than the original fixed-size
	 * v1 @rseq_len (already validated by the caller). Such
	 * registrations use the optimized v2 ABI mode, which also enables
	 * extended RSEQ features beyond MM CID.
	 */
	if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY) && rseq_len > ORIG_RSEQ_SIZE)
		version = 2;

	if (IS_ENABLED(CONFIG_RSEQ_SLICE_EXTENSION) && version > 1) {
		if (rseq_slice_extension_enabled()) {
			rseqfl |= RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE;
			if (flags & RSEQ_FLAG_SLICE_EXT_DEFAULT_ON)
				rseqfl |= RSEQ_CS_FLAG_SLICE_EXT_ENABLED;
		}
	}

	scoped_user_write_access(rseq, efault) {
		/*
		 * If the rseq_cs pointer is non-NULL on registration, clear it
		 * to avoid a potential segfault on return to user-space. The
		 * proper thing to do would have been to fail the registration,
		 * but this would break older libcs that reuse the rseq area
		 * for new threads without clearing the fields. Don't bother
		 * reading it, just reset it.
		 */
		unsafe_put_user(0UL, &rseq->rseq_cs, efault);
		unsafe_put_user(rseqfl, &rseq->flags, efault);
		/* Initialize IDs in user space */
		unsafe_put_user(RSEQ_CPU_ID_UNINITIALIZED, &rseq->cpu_id_start, efault);
		unsafe_put_user(RSEQ_CPU_ID_UNINITIALIZED, &rseq->cpu_id, efault);
		unsafe_put_user(0U, &rseq->node_id, efault);
		unsafe_put_user(0U, &rseq->mm_cid, efault);

		/*
		 * All fields past mm_cid are only valid for non-legacy v2
		 * registrations.
		 */
		if (version > 1) {
			if (IS_ENABLED(CONFIG_RSEQ_SLICE_EXTENSION))
				unsafe_put_user(0U, &rseq->slice_ctrl.all, efault);
		}
	}

	/*
	 * Activate the registration by setting the rseq area address, length
	 * and signature in the task struct.
	 */
	current->rseq.usrptr = rseq;
	current->rseq.len = rseq_len;
	current->rseq.sig = sig;

#ifdef CONFIG_RSEQ_SLICE_EXTENSION
	current->rseq.slice.state.enabled = !!(rseqfl & RSEQ_CS_FLAG_SLICE_EXT_ENABLED);
#endif

	/*
	 * Ensure the cpu_id_start and cpu_id fields are updated before
	 * returning to user-space.
	 */
	current->rseq.event.has_rseq = version;
	rseq_force_update();
	return 0;

efault:
	return -EFAULT;
}

static long rseq_unregister(struct rseq __user *rseq, u32 rseq_len, int flags, u32 sig)
{
	if (flags & ~RSEQ_FLAG_UNREGISTER)
		return -EINVAL;
	if (current->rseq.usrptr != rseq || !current->rseq.usrptr)
		return -EINVAL;
	if (rseq_len != current->rseq.len)
		return -EINVAL;
	if (current->rseq.sig != sig)
		return -EPERM;
	if (!rseq_reset_ids())
		return -EFAULT;
	rseq_reset(current);
	return 0;
}

static long rseq_reregister(struct rseq __user *rseq, u32 rseq_len, u32 sig)
{
	/*
	 * If rseq is already registered, check whether the provided address
	 * differs from the prior one.
	 */
	if (current->rseq.usrptr != rseq || rseq_len != current->rseq.len)
		return -EINVAL;
	if (current->rseq.sig != sig)
		return -EPERM;
	/* Already registered. */
	return -EBUSY;
}

static bool rseq_length_valid(struct rseq __user *rseq, unsigned int rseq_len)
{
	/*
	 * Ensure the provided rseq is properly aligned, as communicated to
	 * user-space through the ELF auxiliary vector AT_RSEQ_ALIGN. If
	 * rseq_len is the original rseq size, the required alignment is the
	 * original struct rseq alignment.
	 *
	 * In order to be valid, rseq_len is either the original rseq size, or
	 * large enough to contain all supported fields, as communicated to
	 * user-space through the ELF auxiliary vector AT_RSEQ_FEATURE_SIZE.
	 */
	if (rseq_len < ORIG_RSEQ_SIZE)
		return false;

	if (rseq_len == ORIG_RSEQ_SIZE)
		return IS_ALIGNED((unsigned long)rseq, ORIG_RSEQ_SIZE);

	return IS_ALIGNED((unsigned long)rseq, rseq_alloc_align()) &&
	       rseq_len >= offsetof(struct rseq, end);
}

#define RSEQ_FLAGS_SUPPORTED (RSEQ_FLAG_SLICE_EXT_DEFAULT_ON)

/*
 * sys_rseq - Register or unregister restartable sequences for the caller thread.
 */
SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32, sig)
{
	if (flags & RSEQ_FLAG_UNREGISTER)
		return rseq_unregister(rseq, rseq_len, flags, sig);

	if (unlikely(flags & ~RSEQ_FLAGS_SUPPORTED))
		return -EINVAL;

	if (current->rseq.usrptr)
		return rseq_reregister(rseq, rseq_len, sig);

	if (!rseq_length_valid(rseq, rseq_len))
		return -EINVAL;

	return rseq_register(rseq, rseq_len, flags, sig);
}
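
/*
 * Illustrative user-space registration sketch (not part of the kernel).
 * Glibc normally performs the registration itself; a raw registration,
 * with __rseq_abi and RSEQ_SIG as assumed user-space names and a
 * signature value chosen by the application to match its abort handlers,
 * would look roughly like:
 *
 *	static __thread struct rseq __rseq_abi __attribute__((aligned(64)));
 *
 *	if (syscall(__NR_rseq, &__rseq_abi, sizeof(__rseq_abi), 0, RSEQ_SIG))
 *		perror("rseq registration");
 *
 * The authoritative size and alignment requirements are communicated via
 * the AT_RSEQ_FEATURE_SIZE and AT_RSEQ_ALIGN auxiliary vector entries, see
 * rseq_length_valid() above. Passing RSEQ_FLAG_UNREGISTER with the same
 * length and signature undoes the registration, see rseq_unregister().
 */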

#ifdef CONFIG_RSEQ_SLICE_EXTENSION
struct slice_timer {
	struct hrtimer timer;
	void *cookie;
};

static const unsigned int rseq_slice_ext_nsecs_min = 5 * NSEC_PER_USEC;
static const unsigned int rseq_slice_ext_nsecs_max = 50 * NSEC_PER_USEC;
unsigned int rseq_slice_ext_nsecs __read_mostly = rseq_slice_ext_nsecs_min;
static DEFINE_PER_CPU(struct slice_timer, slice_timer);
DEFINE_STATIC_KEY_TRUE(rseq_slice_extension_key);

/*
 * When the timer expires and the task is still in user space, the return
 * from interrupt will revoke the grant and schedule. If the task already
 * entered the kernel via a syscall and the timer fires before the syscall
 * work was able to cancel it, then depending on the preemption model this
 * will either reschedule on return from interrupt or in the syscall work
 * below.
 */
static enum hrtimer_restart rseq_slice_expired(struct hrtimer *tmr)
{
	struct slice_timer *st = container_of(tmr, struct slice_timer, timer);

	/*
	 * Validate that the task which armed the timer is still on the
	 * CPU. It could have been scheduled out without canceling the
	 * timer.
	 */
	if (st->cookie == current && current->rseq.slice.state.granted) {
		rseq_stat_inc(rseq_stats.s_expired);
		set_need_resched_current();
	}
	return HRTIMER_NORESTART;
}

bool __rseq_arm_slice_extension_timer(void)
{
	struct slice_timer *st = this_cpu_ptr(&slice_timer);
	struct task_struct *curr = current;

	lockdep_assert_irqs_disabled();

	/*
	 * This check prevents a task, which got a time slice extension
	 * granted, from exceeding the maximum scheduling latency when the
	 * grant expired before going out to user space. Don't bother to
	 * clear the grant here, it will be cleaned up automatically before
	 * going out to user space after being scheduled back in.
	 */
	if (unlikely(curr->rseq.slice.expires < ktime_get_mono_fast_ns())) {
		set_need_resched_current();
		return true;
	}

	/*
	 * Store the task pointer as a cookie for comparison in the timer
	 * function. This is safe as the timer is CPU local and cannot be
	 * in the expiry function at this point.
	 */
	st->cookie = curr;
	hrtimer_start(&st->timer, curr->rseq.slice.expires, HRTIMER_MODE_ABS_PINNED_HARD);
	/* Arm the syscall entry work */
	set_task_syscall_work(curr, SYSCALL_RSEQ_SLICE);
	return false;
}

static void rseq_cancel_slice_extension_timer(void)
{
	struct slice_timer *st = this_cpu_ptr(&slice_timer);

	/*
	 * st->cookie can be safely read as preemption is disabled and the
	 * timer is CPU local.
	 *
	 * As this is most probably the first expiring timer, the cancel is
	 * expensive as it has to reprogram the hardware, but that's less
	 * expensive than going through a full hrtimer_interrupt() cycle
	 * for nothing.
	 *
	 * hrtimer_try_to_cancel() is sufficient here as the timer is CPU
	 * local and once the hrtimer code disabled interrupts the timer
	 * callback cannot be running.
	 */
	if (st->cookie == current)
		hrtimer_try_to_cancel(&st->timer);
}

static inline void rseq_slice_set_need_resched(struct task_struct *curr)
{
	/*
	 * The interrupt guard is required to prevent inconsistent state in
	 * this case:
	 *
	 * set_tsk_need_resched()
	 *	--> Interrupt
	 *		wakeup()
	 *		  set_tsk_need_resched()
	 *		  set_preempt_need_resched()
	 *		schedule_on_return()
	 *		  clear_tsk_need_resched()
	 *		  clear_preempt_need_resched()
	 * set_preempt_need_resched()	<- Inconsistent state
	 *
	 * This is safe vs. a remote set of TIF_NEED_RESCHED because that
	 * only sets the already set bit and does not create inconsistent
	 * state.
	 */
	scoped_guard(irq)
		set_need_resched_current();
}

static void rseq_slice_validate_ctrl(u32 expected)
{
	u32 __user *sctrl = &current->rseq.usrptr->slice_ctrl.all;
	u32 uval;

	if (get_user(uval, sctrl) || uval != expected)
		force_sig(SIGSEGV);
}

/*
 * Invoked from syscall entry if a time slice extension was granted and the
 * kernel did not clear it before user space left the critical section.
 *
 * While the recommended way to relinquish the CPU side effect free is
 * rseq_slice_yield(2), any syscall within a granted slice terminates the
 * grant and immediately reschedules if required. This supports onion layer
 * applications, where the code requesting the grant cannot control the
 * code within the critical section.
 */
void rseq_syscall_enter_work(long syscall)
{
	struct task_struct *curr = current;
	struct rseq_slice_ctrl ctrl = { .granted = curr->rseq.slice.state.granted };

	clear_task_syscall_work(curr, SYSCALL_RSEQ_SLICE);

	if (static_branch_unlikely(&rseq_debug_enabled))
		rseq_slice_validate_ctrl(ctrl.all);

	/*
	 * The kernel might have raced, revoked the grant and updated
	 * userspace, but kept the SLICE work set.
	 */
	if (!ctrl.granted)
		return;

	/*
	 * Required to stabilize the per CPU timer pointer and to make
	 * set_tsk_need_resched() correct on PREEMPT[RT] kernels.
	 *
	 * Leaving the scope will reschedule on preemption models FULL,
	 * LAZY and RT if necessary.
	 */
	scoped_guard(preempt) {
		rseq_cancel_slice_extension_timer();
		/*
		 * Now that preemption is disabled, quickly check whether
		 * the task was already rescheduled before arriving here.
		 */
		if (!curr->rseq.event.sched_switch) {
			rseq_slice_set_need_resched(curr);

			if (syscall == __NR_rseq_slice_yield) {
				rseq_stat_inc(rseq_stats.s_yielded);
				/* Update the yielded state for syscall return */
				curr->rseq.slice.yielded = 1;
			} else {
				rseq_stat_inc(rseq_stats.s_aborted);
			}
		}
	}
	/* Reschedule on NONE/VOLUNTARY preemption models */
	cond_resched();

	/* Clear the grant in kernel state and user space */
	curr->rseq.slice.state.granted = false;
	if (put_user(0U, &curr->rseq.usrptr->slice_ctrl.all))
		force_sig(SIGSEGV);
}

int rseq_slice_extension_prctl(unsigned long arg2, unsigned long arg3)
{
	switch (arg2) {
	case PR_RSEQ_SLICE_EXTENSION_GET:
		if (arg3)
			return -EINVAL;
		return current->rseq.slice.state.enabled ? PR_RSEQ_SLICE_EXT_ENABLE : 0;

	case PR_RSEQ_SLICE_EXTENSION_SET: {
		u32 rflags, valid = RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE;
		bool enable = !!(arg3 & PR_RSEQ_SLICE_EXT_ENABLE);

		if (arg3 & ~PR_RSEQ_SLICE_EXT_ENABLE)
			return -EINVAL;
		if (!rseq_slice_extension_enabled())
			return -ENOTSUPP;
		if (!current->rseq.usrptr)
			return -ENXIO;
		if (!rseq_v2(current))
			return -ENOTSUPP;

		/* No change? */
		if (enable == !!current->rseq.slice.state.enabled)
			return 0;

		if (get_user(rflags, &current->rseq.usrptr->flags))
			goto die;

		if (current->rseq.slice.state.enabled)
			valid |= RSEQ_CS_FLAG_SLICE_EXT_ENABLED;

		if ((rflags & valid) != valid)
			goto die;

		rflags &= ~RSEQ_CS_FLAG_SLICE_EXT_ENABLED;
		rflags |= RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE;
		if (enable)
			rflags |= RSEQ_CS_FLAG_SLICE_EXT_ENABLED;

		if (put_user(rflags, &current->rseq.usrptr->flags))
			goto die;

		current->rseq.slice.state.enabled = enable;
		return 0;
	}
	default:
		return -EINVAL;
	}
die:
	force_sig(SIGSEGV);
	return -EFAULT;
}
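
/*
 * Illustrative sketch (not part of the kernel): a thread which did not opt
 * in at registration time can toggle the slice extension later via prctl().
 * The top-level option name PR_RSEQ_SLICE_EXTENSION is assumed here to be
 * the <linux/prctl.h> constant routed to the handler above:
 *
 *	if (prctl(PR_RSEQ_SLICE_EXTENSION, PR_RSEQ_SLICE_EXTENSION_SET,
 *		  PR_RSEQ_SLICE_EXT_ENABLE, 0, 0) == 0)
 *		;	// slice extensions are now enabled for this thread
 *
 *	int on = prctl(PR_RSEQ_SLICE_EXTENSION, PR_RSEQ_SLICE_EXTENSION_GET,
 *		       0, 0, 0);	// PR_RSEQ_SLICE_EXT_ENABLE or 0
 */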

/**
 * sys_rseq_slice_yield - Yield the current processor side-effect free if a
 *			  task which was granted a time slice extension is
 *			  done with its critical work before being forced out.
 *
 * Return: 1 if the task successfully yielded the CPU within the granted slice.
 *	   0 if the slice extension was either never granted or was revoked by
 *	     exceeding the granted extension, by using a syscall other than
 *	     this one, or by being scheduled out earlier due to a subsequent
 *	     interrupt.
 *
 * The syscall does not schedule itself because the syscall entry work
 * immediately relinquishes the CPU and schedules if required.
 */
SYSCALL_DEFINE0(rseq_slice_yield)
{
	int yielded = !!current->rseq.slice.yielded;

	current->rseq.slice.yielded = 0;
	return yielded;
}
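
/*
 * Illustrative user-space pattern (not part of the kernel), assuming the
 * user-visible struct rseq::slice_ctrl exposes a granted bit and that
 * __NR_rseq_slice_yield is wired up for the architecture: once the kernel
 * has granted an extension, the task should finish its critical work as
 * quickly as possible and relinquish the CPU explicitly:
 *
 *	if (RSEQ_READ_ONCE(__rseq_abi.slice_ctrl.granted))
 *		syscall(__NR_rseq_slice_yield);
 *
 * A return value of 1 means the CPU was yielded within the granted slice;
 * 0 means the grant was never given, had already expired, or was revoked,
 * see the kernel-doc comment above.
 */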

static int rseq_slice_ext_show(struct seq_file *m, void *p)
{
	seq_printf(m, "%d\n", rseq_slice_ext_nsecs);
	return 0;
}

static ssize_t rseq_slice_ext_write(struct file *file, const char __user *ubuf,
				    size_t count, loff_t *ppos)
{
	unsigned int nsecs;

	if (kstrtouint_from_user(ubuf, count, 10, &nsecs))
		return -EINVAL;

	if (nsecs < rseq_slice_ext_nsecs_min)
		return -ERANGE;

	if (nsecs > rseq_slice_ext_nsecs_max)
		return -ERANGE;

	rseq_slice_ext_nsecs = nsecs;

	return count;
}

static int rseq_slice_ext_open(struct inode *inode, struct file *file)
{
	return single_open(file, rseq_slice_ext_show, inode->i_private);
}

static const struct file_operations slice_ext_ops = {
	.open = rseq_slice_ext_open,
	.read = seq_read,
	.write = rseq_slice_ext_write,
	.llseek = seq_lseek,
	.release = single_release,
};

static void rseq_slice_ext_init(struct dentry *root_dir)
{
	debugfs_create_file("slice_ext_nsec", 0644, root_dir, NULL, &slice_ext_ops);
}

static int __init rseq_slice_cmdline(char *str)
{
	bool on;

	if (kstrtobool(str, &on))
		return 0;

	if (!on)
		static_branch_disable(&rseq_slice_extension_key);
	return 1;
}
__setup("rseq_slice_ext=", rseq_slice_cmdline);

static int __init rseq_slice_init(void)
{
	unsigned int cpu;

	for_each_possible_cpu(cpu) {
		hrtimer_setup(per_cpu_ptr(&slice_timer.timer, cpu), rseq_slice_expired,
			      CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED_HARD);
	}
	return 0;
}
device_initcall(rseq_slice_init);
#else
static void rseq_slice_ext_init(struct dentry *root_dir) { }
#endif /* CONFIG_RSEQ_SLICE_EXTENSION */