// SPDX-License-Identifier: GPL-2.0+
/*
 * Restartable sequences system call
 *
 * Copyright (C) 2015, Google, Inc.,
 * Paul Turner <pjt@google.com> and Andrew Hunter <ahh@google.com>
 * Copyright (C) 2015-2018, EfficiOS Inc.,
 * Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
 */

/*
 * Restartable sequences are a lightweight interface that allows
 * user-level code to be executed atomically relative to scheduler
 * preemption and signal delivery. Typically used for implementing
 * per-cpu operations.
 *
 * It allows user-space to perform update operations on per-cpu data
 * without requiring heavy-weight atomic operations.
 *
 * Detailed algorithm of rseq user-space assembly sequences:
 *
 *                     init(rseq_cs)
 *                     cpu = TLS->rseq::cpu_id_start
 *   [1]               TLS->rseq::rseq_cs = rseq_cs
 *   [start_ip]        ----------------------------
 *   [2]               if (cpu != TLS->rseq::cpu_id)
 *                             goto abort_ip;
 *   [3]               <last_instruction_in_cs>
 *   [post_commit_ip]  ----------------------------
 *
 *   The address of jump target abort_ip must be outside the critical
 *   region, i.e.:
 *
 *     [abort_ip] < [start_ip]  || [abort_ip] >= [post_commit_ip]
 *
 *   Steps [2]-[3] (inclusive) need to be a sequence of instructions in
 *   userspace that can handle being interrupted between any of those
 *   instructions, and then resumed to the abort_ip.
 *
 *   1.  Userspace stores the address of the struct rseq_cs assembly
 *       block descriptor into the rseq_cs field of the registered
 *       struct rseq TLS area. This update is performed through a single
 *       store within the inline assembly instruction sequence.
 *       [start_ip]
 *
 *   2.  Userspace tests to check whether the current cpu_id field matches
 *       the cpu number loaded before start_ip, branching to abort_ip
 *       in case of a mismatch.
49 * 50 * If the sequence is preempted or interrupted by a signal 51 * at or after start_ip and before post_commit_ip, then the kernel 52 * clears TLS->__rseq_abi::rseq_cs, and sets the user-space return 53 * ip to abort_ip before returning to user-space, so the preempted 54 * execution resumes at abort_ip. 55 * 56 * 3. Userspace critical section final instruction before 57 * post_commit_ip is the commit. The critical section is 58 * self-terminating. 59 * [post_commit_ip] 60 * 61 * 4. <success> 62 * 63 * On failure at [2], or if interrupted by preempt or signal delivery 64 * between [1] and [3]: 65 * 66 * [abort_ip] 67 * F1. <failure> 68 */ 69 70 /* Required to select the proper per_cpu ops for rseq_stats_inc() */ 71 #define RSEQ_BUILD_SLOW_PATH 72 73 #include <linux/debugfs.h> 74 #include <linux/hrtimer.h> 75 #include <linux/percpu.h> 76 #include <linux/prctl.h> 77 #include <linux/ratelimit.h> 78 #include <linux/rseq_entry.h> 79 #include <linux/sched.h> 80 #include <linux/syscalls.h> 81 #include <linux/uaccess.h> 82 #include <linux/types.h> 83 #include <linux/rseq.h> 84 #include <asm/ptrace.h> 85 86 #define CREATE_TRACE_POINTS 87 #include <trace/events/rseq.h> 88 89 DEFINE_STATIC_KEY_MAYBE(CONFIG_RSEQ_DEBUG_DEFAULT_ENABLE, rseq_debug_enabled); 90 91 static inline void rseq_control_debug(bool on) 92 { 93 if (on) 94 static_branch_enable(&rseq_debug_enabled); 95 else 96 static_branch_disable(&rseq_debug_enabled); 97 } 98 99 static int __init rseq_setup_debug(char *str) 100 { 101 bool on; 102 103 if (kstrtobool(str, &on)) 104 return -EINVAL; 105 rseq_control_debug(on); 106 return 1; 107 } 108 __setup("rseq_debug=", rseq_setup_debug); 109 110 #ifdef CONFIG_TRACEPOINTS 111 /* 112 * Out of line, so the actual update functions can be in a header to be 113 * inlined into the exit to user code. 
114 */ 115 void __rseq_trace_update(struct task_struct *t) 116 { 117 trace_rseq_update(t); 118 } 119 120 void __rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip, 121 unsigned long offset, unsigned long abort_ip) 122 { 123 trace_rseq_ip_fixup(ip, start_ip, offset, abort_ip); 124 } 125 #endif /* CONFIG_TRACEPOINTS */ 126 127 #ifdef CONFIG_RSEQ_STATS 128 DEFINE_PER_CPU(struct rseq_stats, rseq_stats); 129 130 static int rseq_stats_show(struct seq_file *m, void *p) 131 { 132 struct rseq_stats stats = { }; 133 unsigned int cpu; 134 135 for_each_possible_cpu(cpu) { 136 stats.exit += data_race(per_cpu(rseq_stats.exit, cpu)); 137 stats.signal += data_race(per_cpu(rseq_stats.signal, cpu)); 138 stats.slowpath += data_race(per_cpu(rseq_stats.slowpath, cpu)); 139 stats.fastpath += data_race(per_cpu(rseq_stats.fastpath, cpu)); 140 stats.ids += data_race(per_cpu(rseq_stats.ids, cpu)); 141 stats.cs += data_race(per_cpu(rseq_stats.cs, cpu)); 142 stats.clear += data_race(per_cpu(rseq_stats.clear, cpu)); 143 stats.fixup += data_race(per_cpu(rseq_stats.fixup, cpu)); 144 if (IS_ENABLED(CONFIG_RSEQ_SLICE_EXTENSION)) { 145 stats.s_granted += data_race(per_cpu(rseq_stats.s_granted, cpu)); 146 stats.s_expired += data_race(per_cpu(rseq_stats.s_expired, cpu)); 147 stats.s_revoked += data_race(per_cpu(rseq_stats.s_revoked, cpu)); 148 stats.s_yielded += data_race(per_cpu(rseq_stats.s_yielded, cpu)); 149 stats.s_aborted += data_race(per_cpu(rseq_stats.s_aborted, cpu)); 150 } 151 } 152 153 seq_printf(m, "exit: %16lu\n", stats.exit); 154 seq_printf(m, "signal: %16lu\n", stats.signal); 155 seq_printf(m, "slowp: %16lu\n", stats.slowpath); 156 seq_printf(m, "fastp: %16lu\n", stats.fastpath); 157 seq_printf(m, "ids: %16lu\n", stats.ids); 158 seq_printf(m, "cs: %16lu\n", stats.cs); 159 seq_printf(m, "clear: %16lu\n", stats.clear); 160 seq_printf(m, "fixup: %16lu\n", stats.fixup); 161 if (IS_ENABLED(CONFIG_RSEQ_SLICE_EXTENSION)) { 162 seq_printf(m, "sgrant: %16lu\n", stats.s_granted); 163 
seq_printf(m, "sexpir: %16lu\n", stats.s_expired); 164 seq_printf(m, "srevok: %16lu\n", stats.s_revoked); 165 seq_printf(m, "syield: %16lu\n", stats.s_yielded); 166 seq_printf(m, "sabort: %16lu\n", stats.s_aborted); 167 } 168 return 0; 169 } 170 171 static int rseq_stats_open(struct inode *inode, struct file *file) 172 { 173 return single_open(file, rseq_stats_show, inode->i_private); 174 } 175 176 static const struct file_operations stat_ops = { 177 .open = rseq_stats_open, 178 .read = seq_read, 179 .llseek = seq_lseek, 180 .release = single_release, 181 }; 182 183 static int __init rseq_stats_init(struct dentry *root_dir) 184 { 185 debugfs_create_file("stats", 0444, root_dir, NULL, &stat_ops); 186 return 0; 187 } 188 #else 189 static inline void rseq_stats_init(struct dentry *root_dir) { } 190 #endif /* CONFIG_RSEQ_STATS */ 191 192 static int rseq_debug_show(struct seq_file *m, void *p) 193 { 194 bool on = static_branch_unlikely(&rseq_debug_enabled); 195 196 seq_printf(m, "%d\n", on); 197 return 0; 198 } 199 200 static ssize_t rseq_debug_write(struct file *file, const char __user *ubuf, 201 size_t count, loff_t *ppos) 202 { 203 bool on; 204 205 if (kstrtobool_from_user(ubuf, count, &on)) 206 return -EINVAL; 207 208 rseq_control_debug(on); 209 return count; 210 } 211 212 static int rseq_debug_open(struct inode *inode, struct file *file) 213 { 214 return single_open(file, rseq_debug_show, inode->i_private); 215 } 216 217 static const struct file_operations debug_ops = { 218 .open = rseq_debug_open, 219 .read = seq_read, 220 .write = rseq_debug_write, 221 .llseek = seq_lseek, 222 .release = single_release, 223 }; 224 225 static void rseq_slice_ext_init(struct dentry *root_dir); 226 227 static int __init rseq_debugfs_init(void) 228 { 229 struct dentry *root_dir = debugfs_create_dir("rseq", NULL); 230 231 debugfs_create_file("debug", 0644, root_dir, NULL, &debug_ops); 232 rseq_stats_init(root_dir); 233 if (IS_ENABLED(CONFIG_RSEQ_SLICE_EXTENSION)) 234 
rseq_slice_ext_init(root_dir); 235 return 0; 236 } 237 __initcall(rseq_debugfs_init); 238 239 static bool rseq_handle_cs(struct task_struct *t, struct pt_regs *regs) 240 { 241 struct rseq __user *urseq = t->rseq.usrptr; 242 u64 csaddr; 243 244 scoped_user_read_access(urseq, efault) 245 unsafe_get_user(csaddr, &urseq->rseq_cs, efault); 246 if (likely(!csaddr)) 247 return true; 248 return rseq_update_user_cs(t, regs, csaddr); 249 efault: 250 return false; 251 } 252 253 static void rseq_slowpath_update_usr(struct pt_regs *regs) 254 { 255 /* 256 * Preserve has_rseq and user_irq state. The generic entry code clears 257 * user_irq on the way out, the non-generic entry architectures are not 258 * setting user_irq. 259 */ 260 const struct rseq_event evt_mask = { 261 .has_rseq = RSEQ_HAS_RSEQ_VERSION_MASK, 262 .user_irq = true, 263 }; 264 struct task_struct *t = current; 265 struct rseq_ids ids; 266 bool event; 267 268 if (unlikely(t->flags & PF_EXITING)) 269 return; 270 271 rseq_stat_inc(rseq_stats.slowpath); 272 273 /* 274 * Read and clear the event pending bit first. If the task 275 * was not preempted or migrated or a signal is on the way, 276 * there is no point in doing any of the heavy lifting here 277 * on production kernels. In that case TIF_NOTIFY_RESUME 278 * was raised by some other functionality. 279 * 280 * This is correct because the read/clear operation is 281 * guarded against scheduler preemption, which makes it CPU 282 * local atomic. If the task is preempted right after 283 * re-enabling preemption then TIF_NOTIFY_RESUME is set 284 * again and this function is invoked another time _before_ 285 * the task is able to return to user mode. 286 * 287 * On a debug kernel, invoke the fixup code unconditionally 288 * with the result handed in to allow the detection of 289 * inconsistencies. 
290 */ 291 scoped_guard(irq) { 292 event = t->rseq.event.sched_switch; 293 t->rseq.event.all &= evt_mask.all; 294 ids.cpu_id = task_cpu(t); 295 ids.mm_cid = task_mm_cid(t); 296 } 297 298 if (!event) 299 return; 300 301 ids.node_id = cpu_to_node(ids.cpu_id); 302 303 if (unlikely(!rseq_update_usr(t, regs, &ids))) { 304 /* 305 * Clear the errors just in case this might survive magically, but 306 * leave the rest intact. 307 */ 308 t->rseq.event.error = 0; 309 force_sig(SIGSEGV); 310 } 311 } 312 313 void __rseq_handle_slowpath(struct pt_regs *regs) 314 { 315 /* 316 * If invoked from hypervisors before entering the guest via 317 * resume_user_mode_work(), then @regs is a NULL pointer. 318 * 319 * resume_user_mode_work() clears TIF_NOTIFY_RESUME and re-raises 320 * it before returning from the ioctl() to user space when 321 * rseq_event.sched_switch is set. 322 * 323 * So it's safe to ignore here instead of pointlessly updating it 324 * in the vcpu_run() loop. 325 */ 326 if (!regs) 327 return; 328 329 rseq_slowpath_update_usr(regs); 330 } 331 332 void __rseq_signal_deliver(int sig, struct pt_regs *regs) 333 { 334 rseq_stat_inc(rseq_stats.signal); 335 336 /* 337 * Don't update IDs yet, they are handled on exit to user if 338 * necessary. The important thing is to abort a critical section of 339 * the interrupted context as after this point the instruction 340 * pointer in @regs points to the signal handler. 341 */ 342 if (unlikely(!rseq_handle_cs(current, regs))) { 343 /* 344 * Clear the errors just in case this might survive 345 * magically, but leave the rest intact. 346 */ 347 current->rseq.event.error = 0; 348 force_sigsegv(sig); 349 } 350 351 /* 352 * In legacy mode, force the update of IDs before returning to user 353 * space to stay compatible. 354 */ 355 if (!rseq_v2(current)) 356 rseq_force_update(); 357 } 358 359 /* 360 * Terminate the process if a syscall is issued within a restartable 361 * sequence. 
362 */ 363 void __rseq_debug_syscall_return(struct pt_regs *regs) 364 { 365 struct task_struct *t = current; 366 u64 csaddr; 367 368 if (!t->rseq.event.has_rseq) 369 return; 370 if (get_user(csaddr, &t->rseq.usrptr->rseq_cs)) 371 goto fail; 372 if (likely(!csaddr)) 373 return; 374 if (unlikely(csaddr >= TASK_SIZE)) 375 goto fail; 376 if (rseq_debug_update_user_cs(t, regs, csaddr)) 377 return; 378 fail: 379 force_sig(SIGSEGV); 380 } 381 382 #ifdef CONFIG_DEBUG_RSEQ 383 /* Kept around to keep GENERIC_ENTRY=n architectures supported. */ 384 void rseq_syscall(struct pt_regs *regs) 385 { 386 __rseq_debug_syscall_return(regs); 387 } 388 #endif 389 390 static bool rseq_reset_ids(void) 391 { 392 struct rseq __user *rseq = current->rseq.usrptr; 393 394 /* 395 * If this fails, terminate it because this leaves the kernel in 396 * stupid state as exit to user space will try to fixup the ids 397 * again. 398 */ 399 scoped_user_rw_access(rseq, efault) { 400 unsafe_put_user(0, &rseq->cpu_id_start, efault); 401 unsafe_put_user(RSEQ_CPU_ID_UNINITIALIZED, &rseq->cpu_id, efault); 402 unsafe_put_user(0, &rseq->node_id, efault); 403 unsafe_put_user(0, &rseq->mm_cid, efault); 404 } 405 return true; 406 407 efault: 408 force_sig(SIGSEGV); 409 return false; 410 } 411 412 /* The original rseq structure size (including padding) is 32 bytes. */ 413 #define ORIG_RSEQ_SIZE 32 414 415 static long rseq_register(struct rseq __user * rseq, u32 rseq_len, int flags, u32 sig) 416 { 417 u32 rseqfl = 0; 418 u8 version = 1; 419 420 if (!access_ok(rseq, rseq_len)) 421 return -EFAULT; 422 423 /* 424 * Architectures, which use the generic IRQ entry code (at least) enable 425 * registrations with a size greater than the original v1 fixed sized 426 * @rseq_len, which has been validated already to utilize the optimized 427 * v2 ABI mode which also enables extended RSEQ features beyond MMCID. 
428 */ 429 if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY) && rseq_len > ORIG_RSEQ_SIZE) 430 version = 2; 431 432 if (IS_ENABLED(CONFIG_RSEQ_SLICE_EXTENSION) && version > 1) { 433 if (rseq_slice_extension_enabled()) { 434 rseqfl |= RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE; 435 if (flags & RSEQ_FLAG_SLICE_EXT_DEFAULT_ON) 436 rseqfl |= RSEQ_CS_FLAG_SLICE_EXT_ENABLED; 437 } 438 } 439 440 scoped_user_write_access(rseq, efault) { 441 /* 442 * If the rseq_cs pointer is non-NULL on registration, clear it to 443 * avoid a potential segfault on return to user-space. The proper thing 444 * to do would have been to fail the registration but this would break 445 * older libcs that reuse the rseq area for new threads without 446 * clearing the fields. Don't bother reading it, just reset it. 447 */ 448 unsafe_put_user(0UL, &rseq->rseq_cs, efault); 449 unsafe_put_user(rseqfl, &rseq->flags, efault); 450 /* Initialize IDs in user space */ 451 unsafe_put_user(RSEQ_CPU_ID_UNINITIALIZED, &rseq->cpu_id_start, efault); 452 unsafe_put_user(RSEQ_CPU_ID_UNINITIALIZED, &rseq->cpu_id, efault); 453 unsafe_put_user(0U, &rseq->node_id, efault); 454 unsafe_put_user(0U, &rseq->mm_cid, efault); 455 456 /* 457 * All fields past mm_cid are only valid for non-legacy v2 458 * registrations. 459 */ 460 if (version > 1) { 461 if (IS_ENABLED(CONFIG_RSEQ_SLICE_EXTENSION)) 462 unsafe_put_user(0U, &rseq->slice_ctrl.all, efault); 463 } 464 } 465 466 /* 467 * Activate the registration by setting the rseq area address, length 468 * and signature in the task struct. 469 */ 470 current->rseq.usrptr = rseq; 471 current->rseq.len = rseq_len; 472 current->rseq.sig = sig; 473 474 #ifdef CONFIG_RSEQ_SLICE_EXTENSION 475 current->rseq.slice.state.enabled = !!(rseqfl & RSEQ_CS_FLAG_SLICE_EXT_ENABLED); 476 #endif 477 478 /* 479 * Ensure the cpu_id_start and cpu_id fields are updated before 480 * returning to user-space. 
481 */ 482 current->rseq.event.has_rseq = version; 483 rseq_force_update(); 484 return 0; 485 486 efault: 487 return -EFAULT; 488 } 489 490 static long rseq_unregister(struct rseq __user * rseq, u32 rseq_len, int flags, u32 sig) 491 { 492 if (flags & ~RSEQ_FLAG_UNREGISTER) 493 return -EINVAL; 494 if (current->rseq.usrptr != rseq || !current->rseq.usrptr) 495 return -EINVAL; 496 if (rseq_len != current->rseq.len) 497 return -EINVAL; 498 if (current->rseq.sig != sig) 499 return -EPERM; 500 if (!rseq_reset_ids()) 501 return -EFAULT; 502 rseq_reset(current); 503 return 0; 504 } 505 506 static long rseq_reregister(struct rseq __user * rseq, u32 rseq_len, u32 sig) 507 { 508 /* 509 * If rseq is already registered, check whether the provided address 510 * differs from the prior one. 511 */ 512 if (current->rseq.usrptr != rseq || rseq_len != current->rseq.len) 513 return -EINVAL; 514 if (current->rseq.sig != sig) 515 return -EPERM; 516 /* Already registered. */ 517 return -EBUSY; 518 } 519 520 static bool rseq_length_valid(struct rseq __user *rseq, unsigned int rseq_len) 521 { 522 /* 523 * Ensure the provided rseq is properly aligned, as communicated to 524 * user-space through the ELF auxiliary vector AT_RSEQ_ALIGN. If 525 * rseq_len is the original rseq size, the required alignment is the 526 * original struct rseq alignment. 527 * 528 * In order to be valid, rseq_len is either the original rseq size, or 529 * large enough to contain all supported fields, as communicated to 530 * user-space through the ELF auxiliary vector AT_RSEQ_FEATURE_SIZE. 
531 */ 532 if (rseq_len < ORIG_RSEQ_SIZE) 533 return false; 534 535 if (rseq_len == ORIG_RSEQ_SIZE) 536 return IS_ALIGNED((unsigned long)rseq, ORIG_RSEQ_SIZE); 537 538 return IS_ALIGNED((unsigned long)rseq, rseq_alloc_align()) && 539 rseq_len >= offsetof(struct rseq, end); 540 } 541 542 #define RSEQ_FLAGS_SUPPORTED (RSEQ_FLAG_SLICE_EXT_DEFAULT_ON) 543 544 /* 545 * sys_rseq - Register or unregister restartable sequences for the caller thread. 546 */ 547 SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32, sig) 548 { 549 if (flags & RSEQ_FLAG_UNREGISTER) 550 return rseq_unregister(rseq, rseq_len, flags, sig); 551 552 if (unlikely(flags & ~RSEQ_FLAGS_SUPPORTED)) 553 return -EINVAL; 554 555 if (current->rseq.usrptr) 556 return rseq_reregister(rseq, rseq_len, sig); 557 558 if (!rseq_length_valid(rseq, rseq_len)) 559 return -EINVAL; 560 561 return rseq_register(rseq, rseq_len, flags, sig); 562 } 563 564 #ifdef CONFIG_RSEQ_SLICE_EXTENSION 565 struct slice_timer { 566 struct hrtimer timer; 567 void *cookie; 568 }; 569 570 static const unsigned int rseq_slice_ext_nsecs_min = 5 * NSEC_PER_USEC; 571 static const unsigned int rseq_slice_ext_nsecs_max = 50 * NSEC_PER_USEC; 572 unsigned int rseq_slice_ext_nsecs __read_mostly = rseq_slice_ext_nsecs_min; 573 static DEFINE_PER_CPU(struct slice_timer, slice_timer); 574 DEFINE_STATIC_KEY_TRUE(rseq_slice_extension_key); 575 576 /* 577 * When the timer expires and the task is still in user space, the return 578 * from interrupt will revoke the grant and schedule. If the task already 579 * entered the kernel via a syscall and the timer fires before the syscall 580 * work was able to cancel it, then depending on the preemption model this 581 * will either reschedule on return from interrupt or in the syscall work 582 * below. 
583 */ 584 static enum hrtimer_restart rseq_slice_expired(struct hrtimer *tmr) 585 { 586 struct slice_timer *st = container_of(tmr, struct slice_timer, timer); 587 588 /* 589 * Validate that the task which armed the timer is still on the 590 * CPU. It could have been scheduled out without canceling the 591 * timer. 592 */ 593 if (st->cookie == current && current->rseq.slice.state.granted) { 594 rseq_stat_inc(rseq_stats.s_expired); 595 set_need_resched_current(); 596 } 597 return HRTIMER_NORESTART; 598 } 599 600 bool __rseq_arm_slice_extension_timer(void) 601 { 602 struct slice_timer *st = this_cpu_ptr(&slice_timer); 603 struct task_struct *curr = current; 604 605 lockdep_assert_irqs_disabled(); 606 607 /* 608 * This check prevents a task, which got a time slice extension 609 * granted, from exceeding the maximum scheduling latency when the 610 * grant expired before going out to user space. Don't bother to 611 * clear the grant here, it will be cleaned up automatically before 612 * going out to user space after being scheduled back in. 613 */ 614 if ((unlikely(curr->rseq.slice.expires < ktime_get_mono_fast_ns()))) { 615 set_need_resched_current(); 616 return true; 617 } 618 619 /* 620 * Store the task pointer as a cookie for comparison in the timer 621 * function. This is safe as the timer is CPU local and cannot be 622 * in the expiry function at this point. 623 */ 624 st->cookie = curr; 625 hrtimer_start(&st->timer, curr->rseq.slice.expires, HRTIMER_MODE_ABS_PINNED_HARD); 626 /* Arm the syscall entry work */ 627 set_task_syscall_work(curr, SYSCALL_RSEQ_SLICE); 628 return false; 629 } 630 631 static void rseq_cancel_slice_extension_timer(void) 632 { 633 struct slice_timer *st = this_cpu_ptr(&slice_timer); 634 635 /* 636 * st->cookie can be safely read as preemption is disabled and the 637 * timer is CPU local. 
638 * 639 * As this is most probably the first expiring timer, the cancel is 640 * expensive as it has to reprogram the hardware, but that's less 641 * expensive than going through a full hrtimer_interrupt() cycle 642 * for nothing. 643 * 644 * hrtimer_try_to_cancel() is sufficient here as the timer is CPU 645 * local and once the hrtimer code disabled interrupts the timer 646 * callback cannot be running. 647 */ 648 if (st->cookie == current) 649 hrtimer_try_to_cancel(&st->timer); 650 } 651 652 static inline void rseq_slice_set_need_resched(struct task_struct *curr) 653 { 654 /* 655 * The interrupt guard is required to prevent inconsistent state in 656 * this case: 657 * 658 * set_tsk_need_resched() 659 * --> Interrupt 660 * wakeup() 661 * set_tsk_need_resched() 662 * set_preempt_need_resched() 663 * schedule_on_return() 664 * clear_tsk_need_resched() 665 * clear_preempt_need_resched() 666 * set_preempt_need_resched() <- Inconsistent state 667 * 668 * This is safe vs. a remote set of TIF_NEED_RESCHED because that 669 * only sets the already set bit and does not create inconsistent 670 * state. 671 */ 672 scoped_guard(irq) 673 set_need_resched_current(); 674 } 675 676 static void rseq_slice_validate_ctrl(u32 expected) 677 { 678 u32 __user *sctrl = ¤t->rseq.usrptr->slice_ctrl.all; 679 u32 uval; 680 681 if (get_user(uval, sctrl) || uval != expected) 682 force_sig(SIGSEGV); 683 } 684 685 /* 686 * Invoked from syscall entry if a time slice extension was granted and the 687 * kernel did not clear it before user space left the critical section. 688 * 689 * While the recommended way to relinquish the CPU side effect free is 690 * rseq_slice_yield(2), any syscall within a granted slice terminates the 691 * grant and immediately reschedules if required. This supports onion layer 692 * applications, where the code requesting the grant cannot control the 693 * code within the critical section. 
694 */ 695 void rseq_syscall_enter_work(long syscall) 696 { 697 struct task_struct *curr = current; 698 struct rseq_slice_ctrl ctrl = { .granted = curr->rseq.slice.state.granted }; 699 700 clear_task_syscall_work(curr, SYSCALL_RSEQ_SLICE); 701 702 if (static_branch_unlikely(&rseq_debug_enabled)) 703 rseq_slice_validate_ctrl(ctrl.all); 704 705 /* 706 * The kernel might have raced, revoked the grant and updated 707 * userspace, but kept the SLICE work set. 708 */ 709 if (!ctrl.granted) 710 return; 711 712 /* 713 * Required to stabilize the per CPU timer pointer and to make 714 * set_tsk_need_resched() correct on PREEMPT[RT] kernels. 715 * 716 * Leaving the scope will reschedule on preemption models FULL, 717 * LAZY and RT if necessary. 718 */ 719 scoped_guard(preempt) { 720 rseq_cancel_slice_extension_timer(); 721 /* 722 * Now that preemption is disabled, quickly check whether 723 * the task was already rescheduled before arriving here. 724 */ 725 if (!curr->rseq.event.sched_switch) { 726 rseq_slice_set_need_resched(curr); 727 728 if (syscall == __NR_rseq_slice_yield) { 729 rseq_stat_inc(rseq_stats.s_yielded); 730 /* Update the yielded state for syscall return */ 731 curr->rseq.slice.yielded = 1; 732 } else { 733 rseq_stat_inc(rseq_stats.s_aborted); 734 } 735 } 736 } 737 /* Reschedule on NONE/VOLUNTARY preemption models */ 738 cond_resched(); 739 740 /* Clear the grant in kernel state and user space */ 741 curr->rseq.slice.state.granted = false; 742 if (put_user(0U, &curr->rseq.usrptr->slice_ctrl.all)) 743 force_sig(SIGSEGV); 744 } 745 746 int rseq_slice_extension_prctl(unsigned long arg2, unsigned long arg3) 747 { 748 switch (arg2) { 749 case PR_RSEQ_SLICE_EXTENSION_GET: 750 if (arg3) 751 return -EINVAL; 752 return current->rseq.slice.state.enabled ? 
PR_RSEQ_SLICE_EXT_ENABLE : 0; 753 754 case PR_RSEQ_SLICE_EXTENSION_SET: { 755 u32 rflags, valid = RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE; 756 bool enable = !!(arg3 & PR_RSEQ_SLICE_EXT_ENABLE); 757 758 if (arg3 & ~PR_RSEQ_SLICE_EXT_ENABLE) 759 return -EINVAL; 760 if (!rseq_slice_extension_enabled()) 761 return -ENOTSUPP; 762 if (!current->rseq.usrptr) 763 return -ENXIO; 764 if (!rseq_v2(current)) 765 return -ENOTSUPP; 766 767 /* No change? */ 768 if (enable == !!current->rseq.slice.state.enabled) 769 return 0; 770 771 if (get_user(rflags, ¤t->rseq.usrptr->flags)) 772 goto die; 773 774 if (current->rseq.slice.state.enabled) 775 valid |= RSEQ_CS_FLAG_SLICE_EXT_ENABLED; 776 777 if ((rflags & valid) != valid) 778 goto die; 779 780 rflags &= ~RSEQ_CS_FLAG_SLICE_EXT_ENABLED; 781 rflags |= RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE; 782 if (enable) 783 rflags |= RSEQ_CS_FLAG_SLICE_EXT_ENABLED; 784 785 if (put_user(rflags, ¤t->rseq.usrptr->flags)) 786 goto die; 787 788 current->rseq.slice.state.enabled = enable; 789 return 0; 790 } 791 default: 792 return -EINVAL; 793 } 794 die: 795 force_sig(SIGSEGV); 796 return -EFAULT; 797 } 798 799 /** 800 * sys_rseq_slice_yield - yield the current processor side effect free if a 801 * task granted with a time slice extension is done with 802 * the critical work before being forced out. 803 * 804 * Return: 1 if the task successfully yielded the CPU within the granted slice. 805 * 0 if the slice extension was either never granted or was revoked by 806 * going over the granted extension, using a syscall other than this one 807 * or being scheduled out earlier due to a subsequent interrupt. 808 * 809 * The syscall does not schedule because the syscall entry work immediately 810 * relinquishes the CPU and schedules if required. 
811 */ 812 SYSCALL_DEFINE0(rseq_slice_yield) 813 { 814 int yielded = !!current->rseq.slice.yielded; 815 816 current->rseq.slice.yielded = 0; 817 return yielded; 818 } 819 820 static int rseq_slice_ext_show(struct seq_file *m, void *p) 821 { 822 seq_printf(m, "%d\n", rseq_slice_ext_nsecs); 823 return 0; 824 } 825 826 static ssize_t rseq_slice_ext_write(struct file *file, const char __user *ubuf, 827 size_t count, loff_t *ppos) 828 { 829 unsigned int nsecs; 830 831 if (kstrtouint_from_user(ubuf, count, 10, &nsecs)) 832 return -EINVAL; 833 834 if (nsecs < rseq_slice_ext_nsecs_min) 835 return -ERANGE; 836 837 if (nsecs > rseq_slice_ext_nsecs_max) 838 return -ERANGE; 839 840 rseq_slice_ext_nsecs = nsecs; 841 842 return count; 843 } 844 845 static int rseq_slice_ext_open(struct inode *inode, struct file *file) 846 { 847 return single_open(file, rseq_slice_ext_show, inode->i_private); 848 } 849 850 static const struct file_operations slice_ext_ops = { 851 .open = rseq_slice_ext_open, 852 .read = seq_read, 853 .write = rseq_slice_ext_write, 854 .llseek = seq_lseek, 855 .release = single_release, 856 }; 857 858 static void rseq_slice_ext_init(struct dentry *root_dir) 859 { 860 debugfs_create_file("slice_ext_nsec", 0644, root_dir, NULL, &slice_ext_ops); 861 } 862 863 static int __init rseq_slice_cmdline(char *str) 864 { 865 bool on; 866 867 if (kstrtobool(str, &on)) 868 return 0; 869 870 if (!on) 871 static_branch_disable(&rseq_slice_extension_key); 872 return 1; 873 } 874 __setup("rseq_slice_ext=", rseq_slice_cmdline); 875 876 static int __init rseq_slice_init(void) 877 { 878 unsigned int cpu; 879 880 for_each_possible_cpu(cpu) { 881 hrtimer_setup(per_cpu_ptr(&slice_timer.timer, cpu), rseq_slice_expired, 882 CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED_HARD); 883 } 884 return 0; 885 } 886 device_initcall(rseq_slice_init); 887 #else 888 static void rseq_slice_ext_init(struct dentry *root_dir) { } 889 #endif /* CONFIG_RSEQ_SLICE_EXTENSION */ 890