// SPDX-License-Identifier: GPL-2.0+
/*
 * Restartable sequences system call
 *
 * Copyright (C) 2015, Google, Inc.,
 * Paul Turner <pjt@google.com> and Andrew Hunter <ahh@google.com>
 * Copyright (C) 2015-2018, EfficiOS Inc.,
 * Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
 */

/*
 * Restartable sequences are a lightweight interface that allows
 * user-level code to be executed atomically relative to scheduler
 * preemption and signal delivery. Typically used for implementing
 * per-cpu operations.
 *
 * It allows user-space to perform update operations on per-cpu data
 * without requiring heavy-weight atomic operations.
 *
 * Detailed algorithm of rseq user-space assembly sequences:
 *
 *                     init(rseq_cs)
 *                     cpu = TLS->rseq::cpu_id_start
 *   [1]               TLS->rseq::rseq_cs = rseq_cs
 *   [start_ip]        ----------------------------
 *   [2]               if (cpu != TLS->rseq::cpu_id)
 *                             goto abort_ip;
 *   [3]               <last_instruction_in_cs>
 *   [post_commit_ip]  ----------------------------
 *
 *   The address of jump target abort_ip must be outside the critical
 *   region, i.e.:
 *
 *     [abort_ip] < [start_ip]  || [abort_ip] >= [post_commit_ip]
 *
 *   Steps [2]-[3] (inclusive) need to be a sequence of instructions in
 *   userspace that can handle being interrupted between any of those
 *   instructions, and then resumed to the abort_ip.
 *
 *   1.  Userspace stores the address of the struct rseq_cs assembly
 *       block descriptor into the rseq_cs field of the registered
 *       struct rseq TLS area. This update is performed through a single
 *       store within the inline assembly instruction sequence.
 *       [start_ip]
 *
 *   2.  Userspace tests to check whether the current cpu_id field matches
 *       the cpu number loaded before start_ip, branching to abort_ip
 *       in case of a mismatch.
 *
 *       If the sequence is preempted or interrupted by a signal
 *       at or after start_ip and before post_commit_ip, then the kernel
 *       clears TLS->__rseq_abi::rseq_cs, and sets the user-space return
 *       ip to abort_ip before returning to user-space, so the preempted
 *       execution resumes at abort_ip.
 *
 *   3.  Userspace critical section final instruction before
 *       post_commit_ip is the commit. The critical section is
 *       self-terminating.
 *       [post_commit_ip]
 *
 *   4.  <success>
 *
 *   On failure at [2], or if interrupted by preempt or signal delivery
 *   between [1] and [3]:
 *
 *       [abort_ip]
 *   F1. <failure>
 */

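/*
 * Illustrative sketch only: how the sequence above is typically used from
 * user space to increment a per-CPU counter. A real implementation must be
 * written in inline assembly so that start_ip, post_commit_offset and
 * abort_ip are known addresses recorded in a struct rseq_cs descriptor,
 * and the abort target must be preceded by the registered signature (see
 * librseq for complete examples). The C-like pseudocode below, with the
 * hypothetical descriptor my_rseq_cs, only mirrors the numbered steps:
 *
 *	do_percpu_inc(u64 *counters)
 *	{
 *	again:
 *		cpu = TLS->rseq::cpu_id_start;
 *		TLS->rseq::rseq_cs = &my_rseq_cs;	// [1]
 *	start_ip:
 *		if (cpu != TLS->rseq::cpu_id)		// [2]
 *			goto abort_ip;
 *		counters[cpu]++;			// [3] single-instruction commit
 *	post_commit_ip:
 *		return;
 *	abort_ip:
 *		goto again;				// retry, possibly on another CPU
 *	}
 */
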
/* Required to select the proper per_cpu ops for rseq_stats_inc() */
#define RSEQ_BUILD_SLOW_PATH

#include <linux/debugfs.h>
#include <linux/hrtimer.h>
#include <linux/percpu.h>
#include <linux/prctl.h>
#include <linux/ratelimit.h>
#include <linux/rseq_entry.h>
#include <linux/sched.h>
#include <linux/syscalls.h>
#include <linux/uaccess.h>
#include <linux/types.h>
#include <asm/ptrace.h>

#define CREATE_TRACE_POINTS
#include <trace/events/rseq.h>

DEFINE_STATIC_KEY_MAYBE(CONFIG_RSEQ_DEBUG_DEFAULT_ENABLE, rseq_debug_enabled);

static inline void rseq_control_debug(bool on)
{
	if (on)
		static_branch_enable(&rseq_debug_enabled);
	else
		static_branch_disable(&rseq_debug_enabled);
}

static int __init rseq_setup_debug(char *str)
{
	bool on;

	if (kstrtobool(str, &on))
		return -EINVAL;
	rseq_control_debug(on);
	return 1;
}
__setup("rseq_debug=", rseq_setup_debug);

#ifdef CONFIG_TRACEPOINTS
/*
 * Out of line, so the actual update functions can be in a header to be
 * inlined into the exit to user code.
 */
void __rseq_trace_update(struct task_struct *t)
{
	trace_rseq_update(t);
}

void __rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip,
			   unsigned long offset, unsigned long abort_ip)
{
	trace_rseq_ip_fixup(ip, start_ip, offset, abort_ip);
}
#endif /* CONFIG_TRACEPOINTS */

#ifdef CONFIG_RSEQ_STATS
DEFINE_PER_CPU(struct rseq_stats, rseq_stats);

static int rseq_stats_show(struct seq_file *m, void *p)
{
	struct rseq_stats stats = { };
	unsigned int cpu;

	for_each_possible_cpu(cpu) {
		stats.exit	+= data_race(per_cpu(rseq_stats.exit, cpu));
		stats.signal	+= data_race(per_cpu(rseq_stats.signal, cpu));
		stats.slowpath	+= data_race(per_cpu(rseq_stats.slowpath, cpu));
		stats.fastpath	+= data_race(per_cpu(rseq_stats.fastpath, cpu));
		stats.ids	+= data_race(per_cpu(rseq_stats.ids, cpu));
		stats.cs	+= data_race(per_cpu(rseq_stats.cs, cpu));
		stats.clear	+= data_race(per_cpu(rseq_stats.clear, cpu));
		stats.fixup	+= data_race(per_cpu(rseq_stats.fixup, cpu));
		if (IS_ENABLED(CONFIG_RSEQ_SLICE_EXTENSION)) {
			stats.s_granted += data_race(per_cpu(rseq_stats.s_granted, cpu));
			stats.s_expired += data_race(per_cpu(rseq_stats.s_expired, cpu));
			stats.s_revoked += data_race(per_cpu(rseq_stats.s_revoked, cpu));
			stats.s_yielded += data_race(per_cpu(rseq_stats.s_yielded, cpu));
			stats.s_aborted += data_race(per_cpu(rseq_stats.s_aborted, cpu));
		}
	}

	seq_printf(m, "exit:   %16lu\n", stats.exit);
	seq_printf(m, "signal: %16lu\n", stats.signal);
	seq_printf(m, "slowp:  %16lu\n", stats.slowpath);
	seq_printf(m, "fastp:  %16lu\n", stats.fastpath);
	seq_printf(m, "ids:    %16lu\n", stats.ids);
	seq_printf(m, "cs:     %16lu\n", stats.cs);
	seq_printf(m, "clear:  %16lu\n", stats.clear);
	seq_printf(m, "fixup:  %16lu\n", stats.fixup);
	if (IS_ENABLED(CONFIG_RSEQ_SLICE_EXTENSION)) {
		seq_printf(m, "sgrant: %16lu\n", stats.s_granted);
		seq_printf(m, "sexpir: %16lu\n", stats.s_expired);
		seq_printf(m, "srevok: %16lu\n", stats.s_revoked);
		seq_printf(m, "syield: %16lu\n", stats.s_yielded);
		seq_printf(m, "sabort: %16lu\n", stats.s_aborted);
	}
	return 0;
}

static int rseq_stats_open(struct inode *inode, struct file *file)
{
	return single_open(file, rseq_stats_show, inode->i_private);
}

static const struct file_operations stat_ops = {
	.open		= rseq_stats_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};

static int __init rseq_stats_init(struct dentry *root_dir)
{
	debugfs_create_file("stats", 0444, root_dir, NULL, &stat_ops);
	return 0;
}
#else
static inline void rseq_stats_init(struct dentry *root_dir) { }
#endif /* CONFIG_RSEQ_STATS */

static int rseq_debug_show(struct seq_file *m, void *p)
{
	bool on = static_branch_unlikely(&rseq_debug_enabled);

	seq_printf(m, "%d\n", on);
	return 0;
}

static ssize_t rseq_debug_write(struct file *file, const char __user *ubuf,
				size_t count, loff_t *ppos)
{
	bool on;

	if (kstrtobool_from_user(ubuf, count, &on))
		return -EINVAL;

	rseq_control_debug(on);
	return count;
}

static int rseq_debug_open(struct inode *inode, struct file *file)
{
	return single_open(file, rseq_debug_show, inode->i_private);
}

static const struct file_operations debug_ops = {
	.open		= rseq_debug_open,
	.read		= seq_read,
	.write		= rseq_debug_write,
	.llseek		= seq_lseek,
	.release	= single_release,
};

static void rseq_slice_ext_init(struct dentry *root_dir);

static int __init rseq_debugfs_init(void)
{
	struct dentry *root_dir = debugfs_create_dir("rseq", NULL);

	debugfs_create_file("debug", 0644, root_dir, NULL, &debug_ops);
	rseq_stats_init(root_dir);
	if (IS_ENABLED(CONFIG_RSEQ_SLICE_EXTENSION))
		rseq_slice_ext_init(root_dir);
	return 0;
}
__initcall(rseq_debugfs_init);

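/*
 * Usage sketch for the files created above, assuming debugfs is mounted at
 * the conventional /sys/kernel/debug location:
 *
 *	cat /sys/kernel/debug/rseq/stats	# dump the summed per-CPU counters
 *	echo 1 > /sys/kernel/debug/rseq/debug	# enable the debug static branch
 *
 * Writing the "debug" file toggles the same static key as booting with
 * rseq_debug=1.
 */
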
static bool rseq_set_ids(struct task_struct *t, struct rseq_ids *ids, u32 node_id)
{
	return rseq_set_ids_get_csaddr(t, ids, node_id, NULL);
}

static bool rseq_handle_cs(struct task_struct *t, struct pt_regs *regs)
{
	struct rseq __user *urseq = t->rseq.usrptr;
	u64 csaddr;

	scoped_user_read_access(urseq, efault)
		unsafe_get_user(csaddr, &urseq->rseq_cs, efault);
	if (likely(!csaddr))
		return true;
	return rseq_update_user_cs(t, regs, csaddr);
efault:
	return false;
}

static void rseq_slowpath_update_usr(struct pt_regs *regs)
{
	/*
	 * Preserve rseq state and user_irq state. The generic entry code
	 * clears user_irq on the way out; the non-generic entry
	 * architectures do not have user_irq.
	 */
	const struct rseq_event evt_mask = { .has_rseq = true, .user_irq = true, };
	struct task_struct *t = current;
	struct rseq_ids ids;
	u32 node_id;
	bool event;

	if (unlikely(t->flags & PF_EXITING))
		return;

	rseq_stat_inc(rseq_stats.slowpath);

	/*
	 * Read and clear the event pending bit first. If the task
	 * was not preempted or migrated or a signal is on the way,
	 * there is no point in doing any of the heavy lifting here
	 * on production kernels. In that case TIF_NOTIFY_RESUME
	 * was raised by some other functionality.
	 *
	 * This is correct because the read/clear operation is
	 * guarded against scheduler preemption, which makes it CPU
	 * local atomic. If the task is preempted right after
	 * re-enabling preemption then TIF_NOTIFY_RESUME is set
	 * again and this function is invoked another time _before_
	 * the task is able to return to user mode.
	 *
	 * On a debug kernel, invoke the fixup code unconditionally
	 * with the result handed in to allow the detection of
	 * inconsistencies.
	 */
	scoped_guard(irq) {
		event = t->rseq.event.sched_switch;
		t->rseq.event.all &= evt_mask.all;
		ids.cpu_id = task_cpu(t);
		ids.mm_cid = task_mm_cid(t);
	}

	if (!event)
		return;

	node_id = cpu_to_node(ids.cpu_id);

	if (unlikely(!rseq_update_usr(t, regs, &ids, node_id))) {
		/*
		 * Clear the errors just in case this might survive magically, but
		 * leave the rest intact.
		 */
		t->rseq.event.error = 0;
		force_sig(SIGSEGV);
	}
}

void __rseq_handle_slowpath(struct pt_regs *regs)
{
	/*
	 * If invoked from hypervisors before entering the guest via
	 * resume_user_mode_work(), then @regs is a NULL pointer.
	 *
	 * resume_user_mode_work() clears TIF_NOTIFY_RESUME and re-raises
	 * it before returning from the ioctl() to user space when
	 * rseq_event.sched_switch is set.
	 *
	 * So it's safe to ignore here instead of pointlessly updating it
	 * in the vcpu_run() loop.
	 */
	if (!regs)
		return;

	rseq_slowpath_update_usr(regs);
}

void __rseq_signal_deliver(int sig, struct pt_regs *regs)
{
	rseq_stat_inc(rseq_stats.signal);
	/*
	 * Don't update IDs, they are handled on exit to user if
	 * necessary. The important thing is to abort a critical section of
	 * the interrupted context as after this point the instruction
	 * pointer in @regs points to the signal handler.
	 */
	if (unlikely(!rseq_handle_cs(current, regs))) {
		/*
		 * Clear the errors just in case this might survive
		 * magically, but leave the rest intact.
		 */
		current->rseq.event.error = 0;
		force_sigsegv(sig);
	}
}

/*
 * Terminate the process if a syscall is issued within a restartable
 * sequence.
 */
void __rseq_debug_syscall_return(struct pt_regs *regs)
{
	struct task_struct *t = current;
	u64 csaddr;

	if (!t->rseq.event.has_rseq)
		return;
	if (get_user(csaddr, &t->rseq.usrptr->rseq_cs))
		goto fail;
	if (likely(!csaddr))
		return;
	if (unlikely(csaddr >= TASK_SIZE))
		goto fail;
	if (rseq_debug_update_user_cs(t, regs, csaddr))
		return;
fail:
	force_sig(SIGSEGV);
}

#ifdef CONFIG_DEBUG_RSEQ
/* Kept around to keep GENERIC_ENTRY=n architectures supported. */
void rseq_syscall(struct pt_regs *regs)
{
	__rseq_debug_syscall_return(regs);
}
#endif

static bool rseq_reset_ids(void)
{
	struct rseq_ids ids = {
		.cpu_id	= RSEQ_CPU_ID_UNINITIALIZED,
		.mm_cid	= 0,
	};

	/*
	 * If this fails, terminate the task because it leaves the kernel
	 * in a stupid state, as exit to user space would try to fix up
	 * the ids again.
	 */
	if (rseq_set_ids(current, &ids, 0))
		return true;

	force_sig(SIGSEGV);
	return false;
}

/* The original rseq structure size (including padding) is 32 bytes. */
#define ORIG_RSEQ_SIZE		32

/*
 * sys_rseq - setup restartable sequences for caller thread.
 */
SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32, sig)
{
	u32 rseqfl = 0;

	if (flags & RSEQ_FLAG_UNREGISTER) {
		if (flags & ~RSEQ_FLAG_UNREGISTER)
			return -EINVAL;
		/* Unregister rseq for current thread. */
		if (current->rseq.usrptr != rseq || !current->rseq.usrptr)
			return -EINVAL;
		if (rseq_len != current->rseq.len)
			return -EINVAL;
		if (current->rseq.sig != sig)
			return -EPERM;
		if (!rseq_reset_ids())
			return -EFAULT;
		rseq_reset(current);
		return 0;
	}

	if (unlikely(flags & ~(RSEQ_FLAG_SLICE_EXT_DEFAULT_ON)))
		return -EINVAL;

	if (current->rseq.usrptr) {
		/*
		 * If rseq is already registered, check whether
		 * the provided address differs from the prior
		 * one.
		 */
		if (current->rseq.usrptr != rseq || rseq_len != current->rseq.len)
			return -EINVAL;
		if (current->rseq.sig != sig)
			return -EPERM;
		/* Already registered. */
		return -EBUSY;
	}

	/*
	 * If there was no rseq previously registered, ensure the provided rseq
	 * is properly aligned, as communicated to user-space through the ELF
	 * auxiliary vector AT_RSEQ_ALIGN. If rseq_len is the original rseq
	 * size, the required alignment is the original struct rseq alignment.
	 *
	 * In order to be valid, rseq_len is either the original rseq size, or
	 * large enough to contain all supported fields, as communicated to
	 * user-space through the ELF auxiliary vector AT_RSEQ_FEATURE_SIZE.
	 */
	if (rseq_len < ORIG_RSEQ_SIZE ||
	    (rseq_len == ORIG_RSEQ_SIZE && !IS_ALIGNED((unsigned long)rseq, ORIG_RSEQ_SIZE)) ||
	    (rseq_len != ORIG_RSEQ_SIZE && (!IS_ALIGNED((unsigned long)rseq, __alignof__(*rseq)) ||
					    rseq_len < offsetof(struct rseq, end))))
		return -EINVAL;
	if (!access_ok(rseq, rseq_len))
		return -EFAULT;

	if (IS_ENABLED(CONFIG_RSEQ_SLICE_EXTENSION)) {
		rseqfl |= RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE;
		if (rseq_slice_extension_enabled() &&
		    (flags & RSEQ_FLAG_SLICE_EXT_DEFAULT_ON))
			rseqfl |= RSEQ_CS_FLAG_SLICE_EXT_ENABLED;
	}

	scoped_user_write_access(rseq, efault) {
		/*
		 * If the rseq_cs pointer is non-NULL on registration, clear it to
		 * avoid a potential segfault on return to user-space. The proper thing
		 * to do would have been to fail the registration but this would break
		 * older libcs that reuse the rseq area for new threads without
		 * clearing the fields. Don't bother reading it, just reset it.
		 */
		unsafe_put_user(0UL, &rseq->rseq_cs, efault);
		unsafe_put_user(rseqfl, &rseq->flags, efault);
		/* Initialize IDs in user space */
		unsafe_put_user(RSEQ_CPU_ID_UNINITIALIZED, &rseq->cpu_id_start, efault);
		unsafe_put_user(RSEQ_CPU_ID_UNINITIALIZED, &rseq->cpu_id, efault);
		unsafe_put_user(0U, &rseq->node_id, efault);
		unsafe_put_user(0U, &rseq->mm_cid, efault);
		unsafe_put_user(0U, &rseq->slice_ctrl.all, efault);
	}

	/*
	 * Activate the registration by setting the rseq area address, length
	 * and signature in the task struct.
	 */
	current->rseq.usrptr = rseq;
	current->rseq.len = rseq_len;
	current->rseq.sig = sig;

#ifdef CONFIG_RSEQ_SLICE_EXTENSION
	current->rseq.slice.state.enabled = !!(rseqfl & RSEQ_CS_FLAG_SLICE_EXT_ENABLED);
#endif

	/*
	 * If rseq was previously inactive, and has just been
	 * registered, ensure the cpu_id_start and cpu_id fields
	 * are updated before returning to user-space.
	 */
	current->rseq.event.has_rseq = true;
	rseq_force_update();
	return 0;

efault:
	return -EFAULT;
}

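/*
 * For reference, a minimal user-space registration sketch matching the
 * validation rules above. The signature value and names below are purely
 * illustrative, and a libc with rseq support typically performs this
 * registration itself at thread start, in which case a second attempt is
 * rejected with -EINVAL or -EBUSY as handled above:
 *
 *	#include <linux/rseq.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	#define EXAMPLE_RSEQ_SIG	0x53053053	// must match the signature in the asm aborts
 *
 *	static __thread struct rseq example_rseq __attribute__((aligned(32)));
 *
 *	static int example_rseq_register(void)
 *	{
 *		// 32 == ORIG_RSEQ_SIZE, which only requires 32-byte alignment
 *		return syscall(__NR_rseq, &example_rseq, 32, 0, EXAMPLE_RSEQ_SIG);
 *	}
 */
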
#ifdef CONFIG_RSEQ_SLICE_EXTENSION
struct slice_timer {
	struct hrtimer	timer;
	void		*cookie;
};

static const unsigned int rseq_slice_ext_nsecs_min = 5 * NSEC_PER_USEC;
static const unsigned int rseq_slice_ext_nsecs_max = 50 * NSEC_PER_USEC;
unsigned int rseq_slice_ext_nsecs __read_mostly = rseq_slice_ext_nsecs_min;
static DEFINE_PER_CPU(struct slice_timer, slice_timer);
DEFINE_STATIC_KEY_TRUE(rseq_slice_extension_key);

/*
 * When the timer expires and the task is still in user space, the return
 * from interrupt will revoke the grant and schedule. If the task already
 * entered the kernel via a syscall and the timer fires before the syscall
 * work was able to cancel it, then depending on the preemption model this
 * will either reschedule on return from interrupt or in the syscall work
 * below.
 */
static enum hrtimer_restart rseq_slice_expired(struct hrtimer *tmr)
{
	struct slice_timer *st = container_of(tmr, struct slice_timer, timer);

	/*
	 * Validate that the task which armed the timer is still on the
	 * CPU. It could have been scheduled out without canceling the
	 * timer.
	 */
	if (st->cookie == current && current->rseq.slice.state.granted) {
		rseq_stat_inc(rseq_stats.s_expired);
		set_need_resched_current();
	}
	return HRTIMER_NORESTART;
}

bool __rseq_arm_slice_extension_timer(void)
{
	struct slice_timer *st = this_cpu_ptr(&slice_timer);
	struct task_struct *curr = current;

	lockdep_assert_irqs_disabled();

	/*
	 * This check prevents a task, which got a time slice extension
	 * granted, from exceeding the maximum scheduling latency when the
	 * grant expired before going out to user space. Don't bother to
	 * clear the grant here, it will be cleaned up automatically before
	 * going out to user space after being scheduled back in.
	 */
	if (unlikely(curr->rseq.slice.expires < ktime_get_mono_fast_ns())) {
		set_need_resched_current();
		return true;
	}

	/*
	 * Store the task pointer as a cookie for comparison in the timer
	 * function. This is safe as the timer is CPU local and cannot be
	 * in the expiry function at this point.
	 */
	st->cookie = curr;
	hrtimer_start(&st->timer, curr->rseq.slice.expires, HRTIMER_MODE_ABS_PINNED_HARD);
	/* Arm the syscall entry work */
	set_task_syscall_work(curr, SYSCALL_RSEQ_SLICE);
	return false;
}

static void rseq_cancel_slice_extension_timer(void)
{
	struct slice_timer *st = this_cpu_ptr(&slice_timer);

	/*
	 * st->cookie can be safely read as preemption is disabled and the
	 * timer is CPU local.
	 *
	 * As this is most probably the first expiring timer, the cancel is
	 * expensive as it has to reprogram the hardware, but that's less
	 * expensive than going through a full hrtimer_interrupt() cycle
	 * for nothing.
	 *
	 * hrtimer_try_to_cancel() is sufficient here as the timer is CPU
	 * local and once the hrtimer code disabled interrupts the timer
	 * callback cannot be running.
	 */
	if (st->cookie == current)
		hrtimer_try_to_cancel(&st->timer);
}

static inline void rseq_slice_set_need_resched(struct task_struct *curr)
{
	/*
	 * The interrupt guard is required to prevent inconsistent state in
	 * this case:
	 *
	 * set_tsk_need_resched()
	 *	--> Interrupt
	 *		wakeup()
	 *		  set_tsk_need_resched()
	 *		  set_preempt_need_resched()
	 *		schedule_on_return()
	 *		  clear_tsk_need_resched()
	 *		  clear_preempt_need_resched()
	 * set_preempt_need_resched()		<- Inconsistent state
	 *
	 * This is safe vs. a remote set of TIF_NEED_RESCHED because that
	 * only sets the already set bit and does not create inconsistent
	 * state.
	 */
	scoped_guard(irq)
		set_need_resched_current();
}

static void rseq_slice_validate_ctrl(u32 expected)
{
	u32 __user *sctrl = &current->rseq.usrptr->slice_ctrl.all;
	u32 uval;

	if (get_user(uval, sctrl) || uval != expected)
		force_sig(SIGSEGV);
}

/*
 * Invoked from syscall entry if a time slice extension was granted and the
 * kernel did not clear it before user space left the critical section.
 *
 * While the recommended way to relinquish the CPU side effect free is
 * rseq_slice_yield(2), any syscall within a granted slice terminates the
 * grant and immediately reschedules if required. This supports onion layer
 * applications, where the code requesting the grant cannot control the
 * code within the critical section.
 */
void rseq_syscall_enter_work(long syscall)
{
	struct task_struct *curr = current;
	struct rseq_slice_ctrl ctrl = { .granted = curr->rseq.slice.state.granted };

	clear_task_syscall_work(curr, SYSCALL_RSEQ_SLICE);

	if (static_branch_unlikely(&rseq_debug_enabled))
		rseq_slice_validate_ctrl(ctrl.all);

	/*
	 * The kernel might have raced, revoked the grant and updated
	 * userspace, but kept the SLICE work set.
	 */
	if (!ctrl.granted)
		return;

	/*
	 * Required to stabilize the per CPU timer pointer and to make
	 * set_tsk_need_resched() correct on PREEMPT[RT] kernels.
	 *
	 * Leaving the scope will reschedule on preemption models FULL,
	 * LAZY and RT if necessary.
	 */
	scoped_guard(preempt) {
		rseq_cancel_slice_extension_timer();
		/*
		 * Now that preemption is disabled, quickly check whether
		 * the task was already rescheduled before arriving here.
		 */
		if (!curr->rseq.event.sched_switch) {
			rseq_slice_set_need_resched(curr);

			if (syscall == __NR_rseq_slice_yield) {
				rseq_stat_inc(rseq_stats.s_yielded);
				/* Update the yielded state for syscall return */
				curr->rseq.slice.yielded = 1;
			} else {
				rseq_stat_inc(rseq_stats.s_aborted);
			}
		}
	}
	/* Reschedule on NONE/VOLUNTARY preemption models */
	cond_resched();

	/* Clear the grant in kernel state and user space */
	curr->rseq.slice.state.granted = false;
	if (put_user(0U, &curr->rseq.usrptr->slice_ctrl.all))
		force_sig(SIGSEGV);
}

int rseq_slice_extension_prctl(unsigned long arg2, unsigned long arg3)
{
	switch (arg2) {
	case PR_RSEQ_SLICE_EXTENSION_GET:
		if (arg3)
			return -EINVAL;
		return current->rseq.slice.state.enabled ? PR_RSEQ_SLICE_EXT_ENABLE : 0;

	case PR_RSEQ_SLICE_EXTENSION_SET: {
		u32 rflags, valid = RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE;
		bool enable = !!(arg3 & PR_RSEQ_SLICE_EXT_ENABLE);

		if (arg3 & ~PR_RSEQ_SLICE_EXT_ENABLE)
			return -EINVAL;
		if (!rseq_slice_extension_enabled())
			return -ENOTSUPP;
		if (!current->rseq.usrptr)
			return -ENXIO;

		/* No change? */
		if (enable == !!current->rseq.slice.state.enabled)
			return 0;

		if (get_user(rflags, &current->rseq.usrptr->flags))
			goto die;

		if (current->rseq.slice.state.enabled)
			valid |= RSEQ_CS_FLAG_SLICE_EXT_ENABLED;

		if ((rflags & valid) != valid)
			goto die;

		rflags &= ~RSEQ_CS_FLAG_SLICE_EXT_ENABLED;
		rflags |= RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE;
		if (enable)
			rflags |= RSEQ_CS_FLAG_SLICE_EXT_ENABLED;

		if (put_user(rflags, &current->rseq.usrptr->flags))
			goto die;

		current->rseq.slice.state.enabled = enable;
		return 0;
	}
	default:
		return -EINVAL;
	}
die:
	force_sig(SIGSEGV);
	return -EFAULT;
}

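/*
 * A minimal usage sketch for the prctl() interface above. It assumes the
 * top-level prctl option is named PR_RSEQ_SLICE_EXTENSION in the prctl UAPI
 * header (only the GET/SET sub-commands are visible in this file) and that
 * rseq is already registered for the calling thread:
 *
 *	#include <sys/prctl.h>
 *
 *	// Opt in to time slice extensions for the current thread
 *	if (prctl(PR_RSEQ_SLICE_EXTENSION, PR_RSEQ_SLICE_EXTENSION_SET,
 *		  PR_RSEQ_SLICE_EXT_ENABLE, 0, 0))
 *		handle_error();		// hypothetical error handler
 *
 *	// Query the current state: returns PR_RSEQ_SLICE_EXT_ENABLE or 0
 *	int state = prctl(PR_RSEQ_SLICE_EXTENSION, PR_RSEQ_SLICE_EXTENSION_GET, 0, 0, 0);
 */
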
/**
 * sys_rseq_slice_yield - yield the current processor free of side effects
 *			  when a task that was granted a time slice extension
 *			  is done with its critical work before being forced out.
 *
 * Return: 1 if the task successfully yielded the CPU within the granted slice.
 *	   0 if the slice extension was either never granted or was revoked by
 *	     going over the granted extension, using a syscall other than this
 *	     one or being scheduled out earlier due to a subsequent interrupt.
 *
 * The syscall does not schedule because the syscall entry work immediately
 * relinquishes the CPU and schedules if required.
 */
SYSCALL_DEFINE0(rseq_slice_yield)
{
	int yielded = !!current->rseq.slice.yielded;

	current->rseq.slice.yielded = 0;
	return yielded;
}

static int rseq_slice_ext_show(struct seq_file *m, void *p)
{
	seq_printf(m, "%u\n", rseq_slice_ext_nsecs);
	return 0;
}

static ssize_t rseq_slice_ext_write(struct file *file, const char __user *ubuf,
				    size_t count, loff_t *ppos)
{
	unsigned int nsecs;

	if (kstrtouint_from_user(ubuf, count, 10, &nsecs))
		return -EINVAL;

	if (nsecs < rseq_slice_ext_nsecs_min)
		return -ERANGE;

	if (nsecs > rseq_slice_ext_nsecs_max)
		return -ERANGE;

	rseq_slice_ext_nsecs = nsecs;

	return count;
}

static int rseq_slice_ext_open(struct inode *inode, struct file *file)
{
	return single_open(file, rseq_slice_ext_show, inode->i_private);
}

static const struct file_operations slice_ext_ops = {
	.open		= rseq_slice_ext_open,
	.read		= seq_read,
	.write		= rseq_slice_ext_write,
	.llseek		= seq_lseek,
	.release	= single_release,
};

static void rseq_slice_ext_init(struct dentry *root_dir)
{
	debugfs_create_file("slice_ext_nsec", 0644, root_dir, NULL, &slice_ext_ops);
}

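/*
 * Usage sketch for the tunable above, assuming debugfs is mounted at the
 * conventional /sys/kernel/debug location. Values are in nanoseconds and
 * are rejected with -ERANGE outside the
 * [rseq_slice_ext_nsecs_min, rseq_slice_ext_nsecs_max] range, i.e. 5000 to
 * 50000 ns:
 *
 *	echo 30000 > /sys/kernel/debug/rseq/slice_ext_nsec
 *
 * Booting with rseq_slice_ext=0 (handled below) disables the facility
 * entirely via the static key.
 */
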
static int __init rseq_slice_cmdline(char *str)
{
	bool on;

	if (kstrtobool(str, &on))
		return 0;

	if (!on)
		static_branch_disable(&rseq_slice_extension_key);
	return 1;
}
__setup("rseq_slice_ext=", rseq_slice_cmdline);

static int __init rseq_slice_init(void)
{
	unsigned int cpu;

	for_each_possible_cpu(cpu) {
		hrtimer_setup(per_cpu_ptr(&slice_timer.timer, cpu), rseq_slice_expired,
			      CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED_HARD);
	}
	return 0;
}
device_initcall(rseq_slice_init);
#else
static void rseq_slice_ext_init(struct dentry *root_dir) { }
#endif /* CONFIG_RSEQ_SLICE_EXTENSION */