xref: /linux/kernel/rseq.c (revision 23b0f90ba871f096474e1c27c3d14f455189d2d9)
// SPDX-License-Identifier: GPL-2.0+
/*
 * Restartable sequences system call
 *
 * Copyright (C) 2015, Google, Inc.,
 * Paul Turner <pjt@google.com> and Andrew Hunter <ahh@google.com>
 * Copyright (C) 2015-2018, EfficiOS Inc.,
 * Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
 */

/*
 * Restartable sequences are a lightweight interface that allows
 * user-level code to be executed atomically relative to scheduler
 * preemption and signal delivery. Typically used for implementing
 * per-cpu operations.
 *
 * It allows user-space to perform update operations on per-cpu data
 * without requiring heavy-weight atomic operations.
 *
 * Detailed algorithm of rseq user-space assembly sequences:
 *
 *                     init(rseq_cs)
 *                     cpu = TLS->rseq::cpu_id_start
 *   [1]               TLS->rseq::rseq_cs = rseq_cs
 *   [start_ip]        ----------------------------
 *   [2]               if (cpu != TLS->rseq::cpu_id)
 *                             goto abort_ip;
 *   [3]               <last_instruction_in_cs>
 *   [post_commit_ip]  ----------------------------
 *
 *   The address of jump target abort_ip must be outside the critical
 *   region, i.e.:
 *
 *     [abort_ip] < [start_ip]  || [abort_ip] >= [post_commit_ip]
 *
 *   Steps [2]-[3] (inclusive) need to be a sequence of instructions in
 *   userspace that can handle being interrupted between any of those
 *   instructions, and then resumed to the abort_ip.
 *
 *   1.  Userspace stores the address of the struct rseq_cs assembly
 *       block descriptor into the rseq_cs field of the registered
 *       struct rseq TLS area. This update is performed through a single
 *       store within the inline assembly instruction sequence.
 *       [start_ip]
 *
 *   2.  Userspace tests whether the current cpu_id field matches the
 *       cpu number loaded before start_ip, branching to abort_ip in
 *       case of a mismatch.
 *
 *       If the sequence is preempted or interrupted by a signal
 *       at or after start_ip and before post_commit_ip, then the kernel
 *       clears TLS->__rseq_abi::rseq_cs, and sets the user-space return
 *       ip to abort_ip before returning to user-space, so the preempted
 *       execution resumes at abort_ip.
 *
 *   3.  The final instruction of the userspace critical section before
 *       post_commit_ip is the commit. The critical section is
 *       self-terminating.
 *       [post_commit_ip]
 *
 *   4.  <success>
 *
 *   On failure at [2], or if interrupted by preempt or signal delivery
 *   between [1] and [3]:
 *
 *       [abort_ip]
 *   F1. <failure>
 */
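
/*
 * Illustrative user-space sketch (not part of this kernel file): the
 * struct rseq_cs descriptor which step [1] above publishes. This is only
 * a minimal sketch; it assumes the UAPI definitions from <linux/rseq.h>
 * and hypothetical assembly labels cs_start/cs_post_commit/cs_abort that
 * delimit the critical section and the abort handler:
 *
 *	#include <linux/rseq.h>
 *	#include <stdint.h>
 *
 *	// Hypothetical labels defined by the assembly critical section.
 *	extern char cs_start[], cs_post_commit[], cs_abort[];
 *
 *	// struct rseq_cs is already 32-byte aligned in the UAPI header.
 *	static struct rseq_cs descriptor;
 *
 *	static void init_descriptor(void)
 *	{
 *		descriptor.version = 0;
 *		descriptor.flags = 0;
 *		// [start_ip]
 *		descriptor.start_ip = (uintptr_t)cs_start;
 *		// Length of the region [start_ip, post_commit_ip).
 *		descriptor.post_commit_offset =
 *			(uintptr_t)cs_post_commit - (uintptr_t)cs_start;
 *		// Must satisfy: abort_ip < start_ip || abort_ip >= post_commit_ip.
 *		descriptor.abort_ip = (uintptr_t)cs_abort;
 *	}
 *
 * Step [1] then stores &descriptor into TLS->rseq::rseq_cs from within
 * the assembly sequence itself, and the abort target at cs_abort must be
 * preceded by the signature passed to sys_rseq() at registration time.
 */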

/* Required to select the proper per_cpu ops for rseq_stats_inc() */
#define RSEQ_BUILD_SLOW_PATH

#include <linux/debugfs.h>
#include <linux/hrtimer.h>
#include <linux/percpu.h>
#include <linux/prctl.h>
#include <linux/ratelimit.h>
#include <linux/rseq_entry.h>
#include <linux/sched.h>
#include <linux/syscalls.h>
#include <linux/uaccess.h>
#include <linux/types.h>
#include <asm/ptrace.h>

#define CREATE_TRACE_POINTS
#include <trace/events/rseq.h>

DEFINE_STATIC_KEY_MAYBE(CONFIG_RSEQ_DEBUG_DEFAULT_ENABLE, rseq_debug_enabled);

static inline void rseq_control_debug(bool on)
{
	if (on)
		static_branch_enable(&rseq_debug_enabled);
	else
		static_branch_disable(&rseq_debug_enabled);
}

static int __init rseq_setup_debug(char *str)
{
	bool on;

	if (kstrtobool(str, &on))
		return -EINVAL;
	rseq_control_debug(on);
	return 1;
}
__setup("rseq_debug=", rseq_setup_debug);

#ifdef CONFIG_TRACEPOINTS
/*
 * Out of line, so the actual update functions can be in a header to be
 * inlined into the exit to user code.
 */
void __rseq_trace_update(struct task_struct *t)
{
	trace_rseq_update(t);
}

void __rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip,
			   unsigned long offset, unsigned long abort_ip)
{
	trace_rseq_ip_fixup(ip, start_ip, offset, abort_ip);
}
#endif /* CONFIG_TRACEPOINTS */

#ifdef CONFIG_RSEQ_STATS
DEFINE_PER_CPU(struct rseq_stats, rseq_stats);

static int rseq_stats_show(struct seq_file *m, void *p)
{
	struct rseq_stats stats = { };
	unsigned int cpu;

	for_each_possible_cpu(cpu) {
		stats.exit	+= data_race(per_cpu(rseq_stats.exit, cpu));
		stats.signal	+= data_race(per_cpu(rseq_stats.signal, cpu));
		stats.slowpath	+= data_race(per_cpu(rseq_stats.slowpath, cpu));
		stats.fastpath	+= data_race(per_cpu(rseq_stats.fastpath, cpu));
		stats.ids	+= data_race(per_cpu(rseq_stats.ids, cpu));
		stats.cs	+= data_race(per_cpu(rseq_stats.cs, cpu));
		stats.clear	+= data_race(per_cpu(rseq_stats.clear, cpu));
		stats.fixup	+= data_race(per_cpu(rseq_stats.fixup, cpu));
		if (IS_ENABLED(CONFIG_RSEQ_SLICE_EXTENSION)) {
			stats.s_granted	+= data_race(per_cpu(rseq_stats.s_granted, cpu));
			stats.s_expired	+= data_race(per_cpu(rseq_stats.s_expired, cpu));
			stats.s_revoked	+= data_race(per_cpu(rseq_stats.s_revoked, cpu));
			stats.s_yielded	+= data_race(per_cpu(rseq_stats.s_yielded, cpu));
			stats.s_aborted	+= data_race(per_cpu(rseq_stats.s_aborted, cpu));
		}
	}

	seq_printf(m, "exit:   %16lu\n", stats.exit);
	seq_printf(m, "signal: %16lu\n", stats.signal);
	seq_printf(m, "slowp:  %16lu\n", stats.slowpath);
	seq_printf(m, "fastp:  %16lu\n", stats.fastpath);
	seq_printf(m, "ids:    %16lu\n", stats.ids);
	seq_printf(m, "cs:     %16lu\n", stats.cs);
	seq_printf(m, "clear:  %16lu\n", stats.clear);
	seq_printf(m, "fixup:  %16lu\n", stats.fixup);
	if (IS_ENABLED(CONFIG_RSEQ_SLICE_EXTENSION)) {
		seq_printf(m, "sgrant: %16lu\n", stats.s_granted);
		seq_printf(m, "sexpir: %16lu\n", stats.s_expired);
		seq_printf(m, "srevok: %16lu\n", stats.s_revoked);
		seq_printf(m, "syield: %16lu\n", stats.s_yielded);
		seq_printf(m, "sabort: %16lu\n", stats.s_aborted);
	}
	return 0;
}

static int rseq_stats_open(struct inode *inode, struct file *file)
{
	return single_open(file, rseq_stats_show, inode->i_private);
}

static const struct file_operations stat_ops = {
	.open		= rseq_stats_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};

static int __init rseq_stats_init(struct dentry *root_dir)
{
	debugfs_create_file("stats", 0444, root_dir, NULL, &stat_ops);
	return 0;
}
#else
static inline void rseq_stats_init(struct dentry *root_dir) { }
#endif /* CONFIG_RSEQ_STATS */

static int rseq_debug_show(struct seq_file *m, void *p)
{
	bool on = static_branch_unlikely(&rseq_debug_enabled);

	seq_printf(m, "%d\n", on);
	return 0;
}

static ssize_t rseq_debug_write(struct file *file, const char __user *ubuf,
			    size_t count, loff_t *ppos)
{
	bool on;

	if (kstrtobool_from_user(ubuf, count, &on))
		return -EINVAL;

	rseq_control_debug(on);
	return count;
}

static int rseq_debug_open(struct inode *inode, struct file *file)
{
	return single_open(file, rseq_debug_show, inode->i_private);
}

static const struct file_operations debug_ops = {
	.open		= rseq_debug_open,
	.read		= seq_read,
	.write		= rseq_debug_write,
	.llseek		= seq_lseek,
	.release	= single_release,
};

static void rseq_slice_ext_init(struct dentry *root_dir);

static int __init rseq_debugfs_init(void)
{
	struct dentry *root_dir = debugfs_create_dir("rseq", NULL);

	debugfs_create_file("debug", 0644, root_dir, NULL, &debug_ops);
	rseq_stats_init(root_dir);
	if (IS_ENABLED(CONFIG_RSEQ_SLICE_EXTENSION))
		rseq_slice_ext_init(root_dir);
	return 0;
}
__initcall(rseq_debugfs_init);

static bool rseq_set_ids(struct task_struct *t, struct rseq_ids *ids, u32 node_id)
{
	return rseq_set_ids_get_csaddr(t, ids, node_id, NULL);
}

static bool rseq_handle_cs(struct task_struct *t, struct pt_regs *regs)
{
	struct rseq __user *urseq = t->rseq.usrptr;
	u64 csaddr;

	scoped_user_read_access(urseq, efault)
		unsafe_get_user(csaddr, &urseq->rseq_cs, efault);
	if (likely(!csaddr))
		return true;
	return rseq_update_user_cs(t, regs, csaddr);
efault:
	return false;
}

static void rseq_slowpath_update_usr(struct pt_regs *regs)
{
	/*
	 * Preserve rseq state and user_irq state. The generic entry code
	 * clears user_irq on the way out; architectures which do not use
	 * the generic entry code do not have user_irq.
	 */
	const struct rseq_event evt_mask = { .has_rseq = true, .user_irq = true, };
	struct task_struct *t = current;
	struct rseq_ids ids;
	u32 node_id;
	bool event;

	if (unlikely(t->flags & PF_EXITING))
		return;

	rseq_stat_inc(rseq_stats.slowpath);

	/*
	 * Read and clear the event pending bit first. If the task was
	 * neither preempted nor migrated and no signal is on the way,
	 * there is no point in doing any of the heavy lifting here on
	 * production kernels. In that case TIF_NOTIFY_RESUME was raised
	 * by some other functionality.
	 *
	 * This is correct because the read/clear operation is
	 * guarded against scheduler preemption, which makes it CPU
	 * local atomic. If the task is preempted right after
	 * re-enabling preemption then TIF_NOTIFY_RESUME is set
	 * again and this function is invoked another time _before_
	 * the task is able to return to user mode.
	 *
	 * On a debug kernel, invoke the fixup code unconditionally
	 * with the result handed in to allow the detection of
	 * inconsistencies.
	 */
	scoped_guard(irq) {
		event = t->rseq.event.sched_switch;
		t->rseq.event.all &= evt_mask.all;
		ids.cpu_id = task_cpu(t);
		ids.mm_cid = task_mm_cid(t);
	}

	if (!event)
		return;

	node_id = cpu_to_node(ids.cpu_id);

	if (unlikely(!rseq_update_usr(t, regs, &ids, node_id))) {
		/*
		 * Clear the errors just in case this might survive magically, but
		 * leave the rest intact.
		 */
		t->rseq.event.error = 0;
		force_sig(SIGSEGV);
	}
}

void __rseq_handle_slowpath(struct pt_regs *regs)
{
	/*
	 * If invoked from hypervisors before entering the guest via
	 * resume_user_mode_work(), then @regs is a NULL pointer.
	 *
	 * resume_user_mode_work() clears TIF_NOTIFY_RESUME and re-raises
	 * it before returning from the ioctl() to user space when
	 * rseq_event.sched_switch is set.
	 *
	 * So it's safe to ignore here instead of pointlessly updating it
	 * in the vcpu_run() loop.
	 */
	if (!regs)
		return;

	rseq_slowpath_update_usr(regs);
}

void __rseq_signal_deliver(int sig, struct pt_regs *regs)
{
	rseq_stat_inc(rseq_stats.signal);
	/*
	 * Don't update IDs, they are handled on exit to user if
	 * necessary. The important thing is to abort a critical section of
	 * the interrupted context as after this point the instruction
	 * pointer in @regs points to the signal handler.
	 */
	if (unlikely(!rseq_handle_cs(current, regs))) {
		/*
		 * Clear the errors just in case this might survive
		 * magically, but leave the rest intact.
		 */
		current->rseq.event.error = 0;
		force_sigsegv(sig);
	}
}

/*
 * Terminate the process if a syscall is issued within a restartable
 * sequence.
 */
void __rseq_debug_syscall_return(struct pt_regs *regs)
{
	struct task_struct *t = current;
	u64 csaddr;

	if (!t->rseq.event.has_rseq)
		return;
	if (get_user(csaddr, &t->rseq.usrptr->rseq_cs))
		goto fail;
	if (likely(!csaddr))
		return;
	if (unlikely(csaddr >= TASK_SIZE))
		goto fail;
	if (rseq_debug_update_user_cs(t, regs, csaddr))
		return;
fail:
	force_sig(SIGSEGV);
}

#ifdef CONFIG_DEBUG_RSEQ
/* Kept around to keep GENERIC_ENTRY=n architectures supported. */
void rseq_syscall(struct pt_regs *regs)
{
	__rseq_debug_syscall_return(regs);
}
#endif

static bool rseq_reset_ids(void)
{
	struct rseq_ids ids = {
		.cpu_id		= RSEQ_CPU_ID_UNINITIALIZED,
		.mm_cid		= 0,
	};

	/*
	 * If this fails, terminate the task, because a failure leaves the
	 * kernel in an inconsistent state: the exit to user space would
	 * try to fix up the ids again.
	 */
	if (rseq_set_ids(current, &ids, 0))
		return true;

	force_sig(SIGSEGV);
	return false;
}

/* The original rseq structure size (including padding) is 32 bytes. */
#define ORIG_RSEQ_SIZE		32

/*
 * sys_rseq - setup restartable sequences for caller thread.
 */
SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32, sig)
{
	u32 rseqfl = 0;

	if (flags & RSEQ_FLAG_UNREGISTER) {
		if (flags & ~RSEQ_FLAG_UNREGISTER)
			return -EINVAL;
		/* Unregister rseq for current thread. */
		if (current->rseq.usrptr != rseq || !current->rseq.usrptr)
			return -EINVAL;
		if (rseq_len != current->rseq.len)
			return -EINVAL;
		if (current->rseq.sig != sig)
			return -EPERM;
		if (!rseq_reset_ids())
			return -EFAULT;
		rseq_reset(current);
		return 0;
	}

	if (unlikely(flags & ~(RSEQ_FLAG_SLICE_EXT_DEFAULT_ON)))
		return -EINVAL;

	if (current->rseq.usrptr) {
		/*
		 * If rseq is already registered, check whether
		 * the provided address differs from the prior
		 * one.
		 */
		if (current->rseq.usrptr != rseq || rseq_len != current->rseq.len)
			return -EINVAL;
		if (current->rseq.sig != sig)
			return -EPERM;
		/* Already registered. */
		return -EBUSY;
	}

	/*
	 * If there was no rseq previously registered, ensure the provided rseq
	 * is properly aligned, as communicated to user-space through the ELF
	 * auxiliary vector AT_RSEQ_ALIGN. If rseq_len is the original rseq
	 * size, the required alignment is the original struct rseq alignment.
	 *
	 * In order to be valid, rseq_len is either the original rseq size, or
	 * large enough to contain all supported fields, as communicated to
	 * user-space through the ELF auxiliary vector AT_RSEQ_FEATURE_SIZE.
	 * See the user-space registration sketch after this function.
	 */
	if (rseq_len < ORIG_RSEQ_SIZE ||
	    (rseq_len == ORIG_RSEQ_SIZE && !IS_ALIGNED((unsigned long)rseq, ORIG_RSEQ_SIZE)) ||
	    (rseq_len != ORIG_RSEQ_SIZE && (!IS_ALIGNED((unsigned long)rseq, __alignof__(*rseq)) ||
					    rseq_len < offsetof(struct rseq, end))))
		return -EINVAL;
	if (!access_ok(rseq, rseq_len))
		return -EFAULT;

	if (IS_ENABLED(CONFIG_RSEQ_SLICE_EXTENSION)) {
		rseqfl |= RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE;
		if (rseq_slice_extension_enabled() &&
		    (flags & RSEQ_FLAG_SLICE_EXT_DEFAULT_ON))
			rseqfl |= RSEQ_CS_FLAG_SLICE_EXT_ENABLED;
	}

	scoped_user_write_access(rseq, efault) {
		/*
		 * If the rseq_cs pointer is non-NULL on registration, clear it to
		 * avoid a potential segfault on return to user-space. The proper thing
		 * to do would have been to fail the registration but this would break
		 * older libcs that reuse the rseq area for new threads without
		 * clearing the fields. Don't bother reading it, just reset it.
		 */
		unsafe_put_user(0UL, &rseq->rseq_cs, efault);
		unsafe_put_user(rseqfl, &rseq->flags, efault);
		/* Initialize IDs in user space */
		unsafe_put_user(RSEQ_CPU_ID_UNINITIALIZED, &rseq->cpu_id_start, efault);
		unsafe_put_user(RSEQ_CPU_ID_UNINITIALIZED, &rseq->cpu_id, efault);
		unsafe_put_user(0U, &rseq->node_id, efault);
		unsafe_put_user(0U, &rseq->mm_cid, efault);
		unsafe_put_user(0U, &rseq->slice_ctrl.all, efault);
	}

	/*
	 * Activate the registration by setting the rseq area address, length
	 * and signature in the task struct.
	 */
	current->rseq.usrptr = rseq;
	current->rseq.len = rseq_len;
	current->rseq.sig = sig;

#ifdef CONFIG_RSEQ_SLICE_EXTENSION
	current->rseq.slice.state.enabled = !!(rseqfl & RSEQ_CS_FLAG_SLICE_EXT_ENABLED);
#endif

	/*
	 * If rseq was previously inactive, and has just been
	 * registered, ensure the cpu_id_start and cpu_id fields
	 * are updated before returning to user-space.
	 */
	current->rseq.event.has_rseq = true;
	rseq_force_update();
	return 0;

efault:
	return -EFAULT;
}
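
/*
 * Illustrative user-space sketch (not part of this kernel file):
 * registering a TLS rseq area with the syscall above. This is only a
 * minimal sketch using syscall(2) directly with an arbitrary example
 * signature; note that modern glibc typically registers rseq itself at
 * thread start, in which case this registration returns -EBUSY.
 *
 *	#include <linux/rseq.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *	#include <stdio.h>
 *
 *	#define EXAMPLE_RSEQ_SIG	0x53053053
 *
 *	// The UAPI struct rseq is 32-byte aligned, matching AT_RSEQ_ALIGN.
 *	static __thread struct rseq rs;
 *
 *	int main(void)
 *	{
 *		if (syscall(__NR_rseq, &rs, sizeof(rs), 0, EXAMPLE_RSEQ_SIG))
 *			return 1;
 *		// The kernel populates the IDs before returning to user space.
 *		printf("running on cpu %u\n", rs.cpu_id);
 *		// Unregistration requires the same length and signature.
 *		syscall(__NR_rseq, &rs, sizeof(rs), RSEQ_FLAG_UNREGISTER,
 *			EXAMPLE_RSEQ_SIG);
 *		return 0;
 *	}
 */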

#ifdef CONFIG_RSEQ_SLICE_EXTENSION
struct slice_timer {
	struct hrtimer	timer;
	void		*cookie;
};

static const unsigned int rseq_slice_ext_nsecs_min =  5 * NSEC_PER_USEC;
static const unsigned int rseq_slice_ext_nsecs_max = 50 * NSEC_PER_USEC;
unsigned int rseq_slice_ext_nsecs __read_mostly = rseq_slice_ext_nsecs_min;
static DEFINE_PER_CPU(struct slice_timer, slice_timer);
DEFINE_STATIC_KEY_TRUE(rseq_slice_extension_key);

/*
 * When the timer expires and the task is still in user space, the return
 * from interrupt will revoke the grant and schedule. If the task already
 * entered the kernel via a syscall and the timer fires before the syscall
 * work was able to cancel it, then depending on the preemption model this
 * will either reschedule on return from interrupt or in the syscall work
 * below.
 */
static enum hrtimer_restart rseq_slice_expired(struct hrtimer *tmr)
{
	struct slice_timer *st = container_of(tmr, struct slice_timer, timer);

	/*
	 * Validate that the task which armed the timer is still on the
	 * CPU. It could have been scheduled out without canceling the
	 * timer.
	 */
	if (st->cookie == current && current->rseq.slice.state.granted) {
		rseq_stat_inc(rseq_stats.s_expired);
		set_need_resched_current();
	}
	return HRTIMER_NORESTART;
}

bool __rseq_arm_slice_extension_timer(void)
{
	struct slice_timer *st = this_cpu_ptr(&slice_timer);
	struct task_struct *curr = current;

	lockdep_assert_irqs_disabled();

	/*
	 * This check prevents a task, which got a time slice extension
	 * granted, from exceeding the maximum scheduling latency when the
	 * grant expired before going out to user space. Don't bother to
	 * clear the grant here, it will be cleaned up automatically before
	 * going out to user space after being scheduled back in.
	 */
	if (unlikely(curr->rseq.slice.expires < ktime_get_mono_fast_ns())) {
		set_need_resched_current();
		return true;
	}

	/*
	 * Store the task pointer as a cookie for comparison in the timer
	 * function. This is safe as the timer is CPU local and cannot be
	 * in the expiry function at this point.
	 */
	st->cookie = curr;
	hrtimer_start(&st->timer, curr->rseq.slice.expires, HRTIMER_MODE_ABS_PINNED_HARD);
	/* Arm the syscall entry work */
	set_task_syscall_work(curr, SYSCALL_RSEQ_SLICE);
	return false;
}

static void rseq_cancel_slice_extension_timer(void)
{
	struct slice_timer *st = this_cpu_ptr(&slice_timer);

	/*
	 * st->cookie can be safely read as preemption is disabled and the
	 * timer is CPU local.
	 *
	 * As this is most probably the first expiring timer, the cancel is
	 * expensive as it has to reprogram the hardware, but that's less
	 * expensive than going through a full hrtimer_interrupt() cycle
	 * for nothing.
	 *
	 * hrtimer_try_to_cancel() is sufficient here as the timer is CPU
	 * local and once the hrtimer code disabled interrupts the timer
	 * callback cannot be running.
	 */
	if (st->cookie == current)
		hrtimer_try_to_cancel(&st->timer);
}

static inline void rseq_slice_set_need_resched(struct task_struct *curr)
{
	/*
	 * The interrupt guard is required to prevent inconsistent state in
	 * this case:
	 *
	 * set_tsk_need_resched()
	 * --> Interrupt
	 *       wakeup()
	 *        set_tsk_need_resched()
	 *        set_preempt_need_resched()
	 *     schedule_on_return()
	 *        clear_tsk_need_resched()
	 *        clear_preempt_need_resched()
	 * set_preempt_need_resched()		<- Inconsistent state
	 *
	 * This is safe vs. a remote set of TIF_NEED_RESCHED because that
	 * only sets the already set bit and does not create inconsistent
	 * state.
	 */
	scoped_guard(irq)
		set_need_resched_current();
}

static void rseq_slice_validate_ctrl(u32 expected)
{
	u32 __user *sctrl = &current->rseq.usrptr->slice_ctrl.all;
	u32 uval;

	if (get_user(uval, sctrl) || uval != expected)
		force_sig(SIGSEGV);
}

/*
 * Invoked from syscall entry if a time slice extension was granted and the
 * kernel did not clear it before user space left the critical section.
 *
 * While the recommended way to relinquish the CPU free of side effects is
 * rseq_slice_yield(2), any syscall within a granted slice terminates the
 * grant and immediately reschedules if required. This supports onion layer
 * applications, where the code requesting the grant cannot control the
 * code within the critical section.
 */
void rseq_syscall_enter_work(long syscall)
{
	struct task_struct *curr = current;
	struct rseq_slice_ctrl ctrl = { .granted = curr->rseq.slice.state.granted };

	clear_task_syscall_work(curr, SYSCALL_RSEQ_SLICE);

	if (static_branch_unlikely(&rseq_debug_enabled))
		rseq_slice_validate_ctrl(ctrl.all);

	/*
	 * The kernel might have raced, revoked the grant and updated
	 * userspace, but kept the SLICE work set.
	 */
	if (!ctrl.granted)
		return;

	/*
	 * Required to stabilize the per CPU timer pointer and to make
	 * set_tsk_need_resched() correct on PREEMPT[RT] kernels.
	 *
	 * Leaving the scope will reschedule on preemption models FULL,
	 * LAZY and RT if necessary.
	 */
	scoped_guard(preempt) {
		rseq_cancel_slice_extension_timer();
		/*
		 * Now that preemption is disabled, quickly check whether
		 * the task was already rescheduled before arriving here.
		 */
		if (!curr->rseq.event.sched_switch) {
			rseq_slice_set_need_resched(curr);

			if (syscall == __NR_rseq_slice_yield) {
				rseq_stat_inc(rseq_stats.s_yielded);
				/* Update the yielded state for syscall return */
				curr->rseq.slice.yielded = 1;
			} else {
				rseq_stat_inc(rseq_stats.s_aborted);
			}
		}
	}
	/* Reschedule on NONE/VOLUNTARY preemption models */
	cond_resched();

	/* Clear the grant in kernel state and user space */
	curr->rseq.slice.state.granted = false;
	if (put_user(0U, &curr->rseq.usrptr->slice_ctrl.all))
		force_sig(SIGSEGV);
}

int rseq_slice_extension_prctl(unsigned long arg2, unsigned long arg3)
{
	switch (arg2) {
	case PR_RSEQ_SLICE_EXTENSION_GET:
		if (arg3)
			return -EINVAL;
		return current->rseq.slice.state.enabled ? PR_RSEQ_SLICE_EXT_ENABLE : 0;

	case PR_RSEQ_SLICE_EXTENSION_SET: {
		u32 rflags, valid = RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE;
		bool enable = !!(arg3 & PR_RSEQ_SLICE_EXT_ENABLE);

		if (arg3 & ~PR_RSEQ_SLICE_EXT_ENABLE)
			return -EINVAL;
		if (!rseq_slice_extension_enabled())
			return -ENOTSUPP;
		if (!current->rseq.usrptr)
			return -ENXIO;

		/* No change? */
		if (enable == !!current->rseq.slice.state.enabled)
			return 0;

		if (get_user(rflags, &current->rseq.usrptr->flags))
			goto die;

		if (current->rseq.slice.state.enabled)
			valid |= RSEQ_CS_FLAG_SLICE_EXT_ENABLED;

		if ((rflags & valid) != valid)
			goto die;

		rflags &= ~RSEQ_CS_FLAG_SLICE_EXT_ENABLED;
		rflags |= RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE;
		if (enable)
			rflags |= RSEQ_CS_FLAG_SLICE_EXT_ENABLED;

		if (put_user(rflags, &current->rseq.usrptr->flags))
			goto die;

		current->rseq.slice.state.enabled = enable;
		return 0;
	}
	default:
		return -EINVAL;
	}
die:
	force_sig(SIGSEGV);
	return -EFAULT;
}
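
/*
 * Illustrative user-space sketch (not part of this kernel file): opting a
 * thread in to time slice extensions via prctl(2). This is only a minimal
 * sketch; it assumes rseq is already registered for the thread and that
 * the top-level prctl option is exposed to user space as
 * PR_RSEQ_SLICE_EXTENSION (that name is an assumption, it is not visible
 * in this file):
 *
 *	#include <linux/prctl.h>
 *	#include <sys/prctl.h>
 *
 *	static int enable_slice_extension(void)
 *	{
 *		// Assumed option name; the GET/SET and ENABLE constants
 *		// match the handler above.
 *		return prctl(PR_RSEQ_SLICE_EXTENSION,
 *			     PR_RSEQ_SLICE_EXTENSION_SET,
 *			     PR_RSEQ_SLICE_EXT_ENABLE, 0, 0);
 *	}
 *
 * On success the rseq flags word in user space has both
 * RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE and RSEQ_CS_FLAG_SLICE_EXT_ENABLED set.
 */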

/**
 * sys_rseq_slice_yield - yield the current processor, free of side effects,
 *			  when a task that was granted a time slice extension
 *			  finishes its critical work before being forced out.
 *
 * Return: 1 if the task successfully yielded the CPU within the granted slice.
 *         0 if the slice extension was either never granted or was revoked,
 *	     e.g. by exceeding the granted extension, by issuing a syscall
 *	     other than this one, or by being scheduled out earlier due to a
 *	     subsequent interrupt.
 *
 * The syscall does not schedule because the syscall entry work immediately
 * relinquishes the CPU and schedules if required.
 */
SYSCALL_DEFINE0(rseq_slice_yield)
{
	int yielded = !!current->rseq.slice.yielded;

	current->rseq.slice.yielded = 0;
	return yielded;
}
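
/*
 * Illustrative user-space sketch (not part of this kernel file):
 * relinquishing the CPU once the critical work protected by a granted
 * slice extension is done. A minimal sketch, assuming __NR_rseq_slice_yield
 * is provided by the installed kernel headers:
 *
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	static long finish_critical_work(void)
 *	{
 *		// Returns 1 if the CPU was yielded within the granted slice,
 *		// 0 if no grant was active or it was already revoked.
 *		return syscall(__NR_rseq_slice_yield);
 *	}
 */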

static int rseq_slice_ext_show(struct seq_file *m, void *p)
{
	seq_printf(m, "%d\n", rseq_slice_ext_nsecs);
	return 0;
}

static ssize_t rseq_slice_ext_write(struct file *file, const char __user *ubuf,
				    size_t count, loff_t *ppos)
{
	unsigned int nsecs;

	if (kstrtouint_from_user(ubuf, count, 10, &nsecs))
		return -EINVAL;

	if (nsecs < rseq_slice_ext_nsecs_min)
		return -ERANGE;

	if (nsecs > rseq_slice_ext_nsecs_max)
		return -ERANGE;

	rseq_slice_ext_nsecs = nsecs;

	return count;
}

static int rseq_slice_ext_open(struct inode *inode, struct file *file)
{
	return single_open(file, rseq_slice_ext_show, inode->i_private);
}

static const struct file_operations slice_ext_ops = {
	.open		= rseq_slice_ext_open,
	.read		= seq_read,
	.write		= rseq_slice_ext_write,
	.llseek		= seq_lseek,
	.release	= single_release,
};

static void rseq_slice_ext_init(struct dentry *root_dir)
{
	debugfs_create_file("slice_ext_nsec", 0644, root_dir, NULL, &slice_ext_ops);
}

static int __init rseq_slice_cmdline(char *str)
{
	bool on;

	if (kstrtobool(str, &on))
		return 0;

	if (!on)
		static_branch_disable(&rseq_slice_extension_key);
	return 1;
}
__setup("rseq_slice_ext=", rseq_slice_cmdline);

static int __init rseq_slice_init(void)
{
	unsigned int cpu;

	for_each_possible_cpu(cpu) {
		hrtimer_setup(per_cpu_ptr(&slice_timer.timer, cpu), rseq_slice_expired,
			      CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED_HARD);
	}
	return 0;
}
device_initcall(rseq_slice_init);
#else
static void rseq_slice_ext_init(struct dentry *root_dir) { }
#endif /* CONFIG_RSEQ_SLICE_EXTENSION */