// SPDX-License-Identifier: GPL-2.0+
/*
 * Restartable sequences system call
 *
 * Copyright (C) 2015, Google, Inc.,
 * Paul Turner <pjt@google.com> and Andrew Hunter <ahh@google.com>
 * Copyright (C) 2015-2018, EfficiOS Inc.,
 * Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
 */

/*
 * Restartable sequences are a lightweight interface that allows
 * user-level code to be executed atomically relative to scheduler
 * preemption and signal delivery. Typically used for implementing
 * per-cpu operations.
 *
 * It allows user-space to perform update operations on per-cpu data
 * without requiring heavy-weight atomic operations.
 *
 * Detailed algorithm of rseq user-space assembly sequences:
 *
 *                     init(rseq_cs)
 *                     cpu = TLS->rseq::cpu_id_start
 *   [1]               TLS->rseq::rseq_cs = rseq_cs
 *   [start_ip]        ----------------------------
 *   [2]               if (cpu != TLS->rseq::cpu_id)
 *                             goto abort_ip;
 *   [3]               <last_instruction_in_cs>
 *   [post_commit_ip]  ----------------------------
 *
 *   The address of jump target abort_ip must be outside the critical
 *   region, i.e.:
 *
 *     [abort_ip] < [start_ip]  || [abort_ip] >= [post_commit_ip]
 *
 *   Steps [2]-[3] (inclusive) need to be a sequence of instructions in
 *   userspace that can handle being interrupted between any of those
 *   instructions, and then resumed to the abort_ip.
 *
 *   1.  Userspace stores the address of the struct rseq_cs assembly
 *       block descriptor into the rseq_cs field of the registered
 *       struct rseq TLS area. This update is performed through a single
 *       store within the inline assembly instruction sequence.
 *       [start_ip]
 *
 *   2.  Userspace tests to check whether the current cpu_id field matches
 *       the cpu number loaded before start_ip, branching to abort_ip
 *       in case of a mismatch.
 *
 *       If the sequence is preempted or interrupted by a signal
 *       at or after start_ip and before post_commit_ip, then the kernel
 *       clears TLS->__rseq_abi::rseq_cs, and sets the user-space return
 *       ip to abort_ip before returning to user-space, so the preempted
 *       execution resumes at abort_ip.
 *
 *   3.  Userspace critical section final instruction before
 *       post_commit_ip is the commit. The critical section is
 *       self-terminating.
 *       [post_commit_ip]
 *
 *   4.  <success>
 *
 *   On failure at [2], or if interrupted by preempt or signal delivery
 *   between [1] and [3]:
 *
 *       [abort_ip]
 *   F1. <failure>
 */
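/*
 * Purely illustrative sketch of a user-space critical section following
 * the algorithm above, written as C for readability; it is not part of
 * the kernel build. The names (rseq_area, counters, the labels) are
 * invented for the example, and a real implementation (e.g. librseq)
 * must emit steps [1]-[3] as a single inline assembly block so the
 * compiler cannot reorder them and so start_ip/post_commit_ip/abort_ip
 * are real instruction addresses:
 *
 *	struct rseq_cs cs = {
 *		.version		= 0,
 *		.flags			= 0,
 *		.start_ip		= (__u64)&&start,
 *		.post_commit_offset	= (__u64)&&post_commit - (__u64)&&start,
 *		.abort_ip		= (__u64)&&abort,
 *	};
 *	__u32 cpu;
 *
 * retry:
 *	cpu = rseq_area.cpu_id_start;		// load the CPU number
 *	rseq_area.rseq_cs = (__u64)&cs;		// [1] arm the critical section
 * start:
 *	if (cpu != rseq_area.cpu_id)		// [2] raced with migration?
 *		goto abort;
 *	counters[cpu]++;			// [3] the single committing store
 * post_commit:
 *	return;
 * abort:
 *	goto retry;				// or fall back to a locked path
 */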
/* Required to select the proper per_cpu ops for rseq_stats_inc() */
#define RSEQ_BUILD_SLOW_PATH

#include <linux/debugfs.h>
#include <linux/prctl.h>
#include <linux/ratelimit.h>
#include <linux/rseq_entry.h>
#include <linux/sched.h>
#include <linux/syscalls.h>
#include <linux/uaccess.h>
#include <linux/types.h>
#include <asm/ptrace.h>

#define CREATE_TRACE_POINTS
#include <trace/events/rseq.h>

DEFINE_STATIC_KEY_MAYBE(CONFIG_RSEQ_DEBUG_DEFAULT_ENABLE, rseq_debug_enabled);

static inline void rseq_control_debug(bool on)
{
	if (on)
		static_branch_enable(&rseq_debug_enabled);
	else
		static_branch_disable(&rseq_debug_enabled);
}

static int __init rseq_setup_debug(char *str)
{
	bool on;

	if (kstrtobool(str, &on))
		return -EINVAL;
	rseq_control_debug(on);
	return 1;
}
__setup("rseq_debug=", rseq_setup_debug);

#ifdef CONFIG_TRACEPOINTS
/*
 * Out of line, so the actual update functions can be in a header to be
 * inlined into the exit to user code.
 */
void __rseq_trace_update(struct task_struct *t)
{
	trace_rseq_update(t);
}

void __rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip,
			   unsigned long offset, unsigned long abort_ip)
{
	trace_rseq_ip_fixup(ip, start_ip, offset, abort_ip);
}
#endif /* CONFIG_TRACEPOINTS */
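/*
 * Quick reference, illustrative only: the debug checks guarded by
 * rseq_debug_enabled can be switched at boot time with "rseq_debug=on"
 * or "rseq_debug=off" (handled by rseq_setup_debug() above), and at run
 * time through the "debug" file created below when CONFIG_DEBUG_FS is
 * enabled, assuming debugfs is mounted at the usual location:
 *
 *	echo 1 > /sys/kernel/debug/rseq/debug
 *	cat /sys/kernel/debug/rseq/debug
 */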
#ifdef CONFIG_DEBUG_FS
#ifdef CONFIG_RSEQ_STATS
DEFINE_PER_CPU(struct rseq_stats, rseq_stats);

static int rseq_stats_show(struct seq_file *m, void *p)
{
	struct rseq_stats stats = { };
	unsigned int cpu;

	for_each_possible_cpu(cpu) {
		stats.exit += data_race(per_cpu(rseq_stats.exit, cpu));
		stats.signal += data_race(per_cpu(rseq_stats.signal, cpu));
		stats.slowpath += data_race(per_cpu(rseq_stats.slowpath, cpu));
		stats.fastpath += data_race(per_cpu(rseq_stats.fastpath, cpu));
		stats.ids += data_race(per_cpu(rseq_stats.ids, cpu));
		stats.cs += data_race(per_cpu(rseq_stats.cs, cpu));
		stats.clear += data_race(per_cpu(rseq_stats.clear, cpu));
		stats.fixup += data_race(per_cpu(rseq_stats.fixup, cpu));
		if (IS_ENABLED(CONFIG_RSEQ_SLICE_EXTENSION)) {
			stats.s_granted += data_race(per_cpu(rseq_stats.s_granted, cpu));
			stats.s_expired += data_race(per_cpu(rseq_stats.s_expired, cpu));
			stats.s_revoked += data_race(per_cpu(rseq_stats.s_revoked, cpu));
			stats.s_yielded += data_race(per_cpu(rseq_stats.s_yielded, cpu));
			stats.s_aborted += data_race(per_cpu(rseq_stats.s_aborted, cpu));
		}
	}

	seq_printf(m, "exit: %16lu\n", stats.exit);
	seq_printf(m, "signal: %16lu\n", stats.signal);
	seq_printf(m, "slowp: %16lu\n", stats.slowpath);
	seq_printf(m, "fastp: %16lu\n", stats.fastpath);
	seq_printf(m, "ids: %16lu\n", stats.ids);
	seq_printf(m, "cs: %16lu\n", stats.cs);
	seq_printf(m, "clear: %16lu\n", stats.clear);
	seq_printf(m, "fixup: %16lu\n", stats.fixup);
	if (IS_ENABLED(CONFIG_RSEQ_SLICE_EXTENSION)) {
		seq_printf(m, "sgrant: %16lu\n", stats.s_granted);
		seq_printf(m, "sexpir: %16lu\n", stats.s_expired);
		seq_printf(m, "srevok: %16lu\n", stats.s_revoked);
		seq_printf(m, "syield: %16lu\n", stats.s_yielded);
		seq_printf(m, "sabort: %16lu\n", stats.s_aborted);
	}
	return 0;
}

static int rseq_stats_open(struct inode *inode, struct file *file)
{
	return single_open(file, rseq_stats_show, inode->i_private);
}

static const struct file_operations stat_ops = {
	.open = rseq_stats_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = single_release,
};

static int __init rseq_stats_init(struct dentry *root_dir)
{
	debugfs_create_file("stats", 0444, root_dir, NULL, &stat_ops);
	return 0;
}
#else
static inline void rseq_stats_init(struct dentry *root_dir) { }
#endif /* CONFIG_RSEQ_STATS */

static int rseq_debug_show(struct seq_file *m, void *p)
{
	bool on = static_branch_unlikely(&rseq_debug_enabled);

	seq_printf(m, "%d\n", on);
	return 0;
}

static ssize_t rseq_debug_write(struct file *file, const char __user *ubuf,
				size_t count, loff_t *ppos)
{
	bool on;

	if (kstrtobool_from_user(ubuf, count, &on))
		return -EINVAL;

	rseq_control_debug(on);
	return count;
}

static int rseq_debug_open(struct inode *inode, struct file *file)
{
	return single_open(file, rseq_debug_show, inode->i_private);
}

static const struct file_operations debug_ops = {
	.open = rseq_debug_open,
	.read = seq_read,
	.write = rseq_debug_write,
	.llseek = seq_lseek,
	.release = single_release,
};

static int __init rseq_debugfs_init(void)
{
	struct dentry *root_dir = debugfs_create_dir("rseq", NULL);

	debugfs_create_file("debug", 0644, root_dir, NULL, &debug_ops);
	rseq_stats_init(root_dir);
	return 0;
}
__initcall(rseq_debugfs_init);
#endif /* CONFIG_DEBUG_FS */
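/*
 * Illustrative only: with CONFIG_DEBUG_FS and CONFIG_RSEQ_STATS enabled,
 * the aggregated counters printed by rseq_stats_show() above can be read
 * with (assuming debugfs is mounted at /sys/kernel/debug):
 *
 *	cat /sys/kernel/debug/rseq/stats
 *
 * Each line has the form "<label>: <count>" ("exit:", "signal:",
 * "slowp:", "fastp:", "ids:", "cs:", "clear:", "fixup:"), plus the slice
 * extension counters when CONFIG_RSEQ_SLICE_EXTENSION is enabled.
 */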
static bool rseq_set_ids(struct task_struct *t, struct rseq_ids *ids, u32 node_id)
{
	return rseq_set_ids_get_csaddr(t, ids, node_id, NULL);
}

static bool rseq_handle_cs(struct task_struct *t, struct pt_regs *regs)
{
	struct rseq __user *urseq = t->rseq.usrptr;
	u64 csaddr;

	scoped_user_read_access(urseq, efault)
		unsafe_get_user(csaddr, &urseq->rseq_cs, efault);
	if (likely(!csaddr))
		return true;
	return rseq_update_user_cs(t, regs, csaddr);
efault:
	return false;
}

static void rseq_slowpath_update_usr(struct pt_regs *regs)
{
	/*
	 * Preserve rseq state and user_irq state. The generic entry code
	 * clears user_irq on the way out; the non-generic entry
	 * architectures do not have user_irq.
	 */
	const struct rseq_event evt_mask = { .has_rseq = true, .user_irq = true, };
	struct task_struct *t = current;
	struct rseq_ids ids;
	u32 node_id;
	bool event;

	if (unlikely(t->flags & PF_EXITING))
		return;

	rseq_stat_inc(rseq_stats.slowpath);

	/*
	 * Read and clear the event pending bit first. If the task
	 * was not preempted or migrated and no signal is on the way,
	 * there is no point in doing any of the heavy lifting here
	 * on production kernels. In that case TIF_NOTIFY_RESUME
	 * was raised by some other functionality.
	 *
	 * This is correct because the read/clear operation is
	 * guarded against scheduler preemption, which makes it CPU
	 * local atomic. If the task is preempted right after
	 * re-enabling preemption then TIF_NOTIFY_RESUME is set
	 * again and this function is invoked another time _before_
	 * the task is able to return to user mode.
	 *
	 * On a debug kernel, invoke the fixup code unconditionally
	 * with the result handed in to allow the detection of
	 * inconsistencies.
	 */
	scoped_guard(irq) {
		event = t->rseq.event.sched_switch;
		t->rseq.event.all &= evt_mask.all;
		ids.cpu_id = task_cpu(t);
		ids.mm_cid = task_mm_cid(t);
	}

	if (!event)
		return;

	node_id = cpu_to_node(ids.cpu_id);

	if (unlikely(!rseq_update_usr(t, regs, &ids, node_id))) {
		/*
		 * Clear the errors just in case this might survive
		 * magically, but leave the rest intact.
		 */
		t->rseq.event.error = 0;
		force_sig(SIGSEGV);
	}
}

void __rseq_handle_slowpath(struct pt_regs *regs)
{
	/*
	 * If invoked from hypervisors before entering the guest via
	 * resume_user_mode_work(), then @regs is a NULL pointer.
	 *
	 * resume_user_mode_work() clears TIF_NOTIFY_RESUME and re-raises
	 * it before returning from the ioctl() to user space when
	 * rseq_event.sched_switch is set.
	 *
	 * So it's safe to ignore here instead of pointlessly updating it
	 * in the vcpu_run() loop.
	 */
	if (!regs)
		return;

	rseq_slowpath_update_usr(regs);
}

void __rseq_signal_deliver(int sig, struct pt_regs *regs)
{
	rseq_stat_inc(rseq_stats.signal);
	/*
	 * Don't update IDs, they are handled on exit to user if
	 * necessary. The important thing is to abort a critical section of
	 * the interrupted context as after this point the instruction
	 * pointer in @regs points to the signal handler.
	 */
	if (unlikely(!rseq_handle_cs(current, regs))) {
		/*
		 * Clear the errors just in case this might survive
		 * magically, but leave the rest intact.
		 */
		current->rseq.event.error = 0;
		force_sigsegv(sig);
	}
}

/*
 * Terminate the process if a syscall is issued within a restartable
 * sequence.
 */
void __rseq_debug_syscall_return(struct pt_regs *regs)
{
	struct task_struct *t = current;
	u64 csaddr;

	if (!t->rseq.event.has_rseq)
		return;
	if (get_user(csaddr, &t->rseq.usrptr->rseq_cs))
		goto fail;
	if (likely(!csaddr))
		return;
	if (unlikely(csaddr >= TASK_SIZE))
		goto fail;
	if (rseq_debug_update_user_cs(t, regs, csaddr))
		return;
fail:
	force_sig(SIGSEGV);
}

#ifdef CONFIG_DEBUG_RSEQ
/* Kept around to keep GENERIC_ENTRY=n architectures supported. */
void rseq_syscall(struct pt_regs *regs)
{
	__rseq_debug_syscall_return(regs);
}
#endif

static bool rseq_reset_ids(void)
{
	struct rseq_ids ids = {
		.cpu_id = RSEQ_CPU_ID_UNINITIALIZED,
		.mm_cid = 0,
	};

	/*
	 * If this fails, terminate the task because it leaves the kernel
	 * in an inconsistent state: the exit to user space path would try
	 * to fix up the IDs again.
	 */
	if (rseq_set_ids(current, &ids, 0))
		return true;

	force_sig(SIGSEGV);
	return false;
}

/* The original rseq structure size (including padding) is 32 bytes. */
#define ORIG_RSEQ_SIZE 32
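/*
 * Illustrative user-space registration sketch, not part of the kernel
 * build. The variable names and the signature constant are made up for
 * the example; real users normally rely on glibc or librseq, which pick
 * the area size and alignment from the AT_RSEQ_* ELF auxiliary vector
 * entries mentioned in sys_rseq() below:
 *
 *	static __thread struct rseq rseq_area __attribute__((aligned(32)));
 *	const uint32_t sig = RSEQ_SIG;	// arch/application chosen signature
 *
 *	// 32 is the original struct rseq size (ORIG_RSEQ_SIZE); address,
 *	// length and signature must be remembered for unregistration.
 *	syscall(__NR_rseq, &rseq_area, 32, 0, sig);
 */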
/*
 * sys_rseq - setup restartable sequences for caller thread.
 */
SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32, sig)
{
	u32 rseqfl = 0;

	if (flags & RSEQ_FLAG_UNREGISTER) {
		if (flags & ~RSEQ_FLAG_UNREGISTER)
			return -EINVAL;
		/* Unregister rseq for current thread. */
		if (current->rseq.usrptr != rseq || !current->rseq.usrptr)
			return -EINVAL;
		if (rseq_len != current->rseq.len)
			return -EINVAL;
		if (current->rseq.sig != sig)
			return -EPERM;
		if (!rseq_reset_ids())
			return -EFAULT;
		rseq_reset(current);
		return 0;
	}

	if (unlikely(flags))
		return -EINVAL;

	if (current->rseq.usrptr) {
		/*
		 * If rseq is already registered, check whether
		 * the provided address differs from the prior
		 * one.
		 */
		if (current->rseq.usrptr != rseq || rseq_len != current->rseq.len)
			return -EINVAL;
		if (current->rseq.sig != sig)
			return -EPERM;
		/* Already registered. */
		return -EBUSY;
	}

	/*
	 * If there was no rseq previously registered, ensure the provided rseq
	 * is properly aligned, as communicated to user-space through the ELF
	 * auxiliary vector AT_RSEQ_ALIGN. If rseq_len is the original rseq
	 * size, the required alignment is the original struct rseq alignment.
	 *
	 * In order to be valid, rseq_len is either the original rseq size, or
	 * large enough to contain all supported fields, as communicated to
	 * user-space through the ELF auxiliary vector AT_RSEQ_FEATURE_SIZE.
	 */
	if (rseq_len < ORIG_RSEQ_SIZE ||
	    (rseq_len == ORIG_RSEQ_SIZE && !IS_ALIGNED((unsigned long)rseq, ORIG_RSEQ_SIZE)) ||
	    (rseq_len != ORIG_RSEQ_SIZE && (!IS_ALIGNED((unsigned long)rseq, __alignof__(*rseq)) ||
					    rseq_len < offsetof(struct rseq, end))))
		return -EINVAL;
	if (!access_ok(rseq, rseq_len))
		return -EFAULT;

	if (IS_ENABLED(CONFIG_RSEQ_SLICE_EXTENSION))
		rseqfl |= RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE;

	scoped_user_write_access(rseq, efault) {
		/*
		 * If the rseq_cs pointer is non-NULL on registration, clear it to
		 * avoid a potential segfault on return to user-space. The proper thing
		 * to do would have been to fail the registration but this would break
		 * older libcs that reuse the rseq area for new threads without
		 * clearing the fields. Don't bother reading it, just reset it.
		 */
		unsafe_put_user(0UL, &rseq->rseq_cs, efault);
		unsafe_put_user(rseqfl, &rseq->flags, efault);
		/* Initialize IDs in user space */
		unsafe_put_user(RSEQ_CPU_ID_UNINITIALIZED, &rseq->cpu_id_start, efault);
		unsafe_put_user(RSEQ_CPU_ID_UNINITIALIZED, &rseq->cpu_id, efault);
		unsafe_put_user(0U, &rseq->node_id, efault);
		unsafe_put_user(0U, &rseq->mm_cid, efault);
		unsafe_put_user(0U, &rseq->slice_ctrl.all, efault);
	}

	/*
	 * Activate the registration by setting the rseq area address, length
	 * and signature in the task struct.
	 */
	current->rseq.usrptr = rseq;
	current->rseq.len = rseq_len;
	current->rseq.sig = sig;

	/*
	 * If rseq was previously inactive, and has just been
	 * registered, ensure the cpu_id_start and cpu_id fields
	 * are updated before returning to user-space.
	 */
	current->rseq.event.has_rseq = true;
	rseq_force_update();
	return 0;

efault:
	return -EFAULT;
}
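/*
 * Illustrative unregistration sketch, not part of the kernel build, with
 * the same made-up names as the registration sketch above. As enforced
 * by sys_rseq(), the address, length and signature must match the values
 * used at registration time:
 *
 *	syscall(__NR_rseq, &rseq_area, 32, RSEQ_FLAG_UNREGISTER, sig);
 */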
#ifdef CONFIG_RSEQ_SLICE_EXTENSION
DEFINE_STATIC_KEY_TRUE(rseq_slice_extension_key);

int rseq_slice_extension_prctl(unsigned long arg2, unsigned long arg3)
{
	switch (arg2) {
	case PR_RSEQ_SLICE_EXTENSION_GET:
		if (arg3)
			return -EINVAL;
		return current->rseq.slice.state.enabled ? PR_RSEQ_SLICE_EXT_ENABLE : 0;

	case PR_RSEQ_SLICE_EXTENSION_SET: {
		u32 rflags, valid = RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE;
		bool enable = !!(arg3 & PR_RSEQ_SLICE_EXT_ENABLE);

		if (arg3 & ~PR_RSEQ_SLICE_EXT_ENABLE)
			return -EINVAL;
		if (!rseq_slice_extension_enabled())
			return -ENOTSUPP;
		if (!current->rseq.usrptr)
			return -ENXIO;

		/* No change? */
		if (enable == !!current->rseq.slice.state.enabled)
			return 0;

		if (get_user(rflags, &current->rseq.usrptr->flags))
			goto die;

		if (current->rseq.slice.state.enabled)
			valid |= RSEQ_CS_FLAG_SLICE_EXT_ENABLED;

		if ((rflags & valid) != valid)
			goto die;

		rflags &= ~RSEQ_CS_FLAG_SLICE_EXT_ENABLED;
		rflags |= RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE;
		if (enable)
			rflags |= RSEQ_CS_FLAG_SLICE_EXT_ENABLED;

		if (put_user(rflags, &current->rseq.usrptr->flags))
			goto die;

		current->rseq.slice.state.enabled = enable;
		return 0;
	}
	default:
		return -EINVAL;
	}
die:
	force_sig(SIGSEGV);
	return -EFAULT;
}

/**
 * sys_rseq_slice_yield - yield the current processor, free of side effects,
 *			  when a task that was granted a time slice extension
 *			  is done with its critical work before being forced out.
 *
 * Return: 1 if the task successfully yielded the CPU within the granted slice.
 *	   0 if the slice extension was either never granted or was revoked
 *	     because the task ran over the granted extension, issued a syscall
 *	     other than this one, or was scheduled out earlier due to a
 *	     subsequent interrupt.
 *
 * The syscall does not schedule because the syscall entry work immediately
 * relinquishes the CPU and schedules if required.
 */
SYSCALL_DEFINE0(rseq_slice_yield)
{
	int yielded = !!current->rseq.slice.yielded;

	current->rseq.slice.yielded = 0;
	return yielded;
}

static int __init rseq_slice_cmdline(char *str)
{
	bool on;

	if (kstrtobool(str, &on))
		return 0;

	if (!on)
		static_branch_disable(&rseq_slice_extension_key);
	return 1;
}
__setup("rseq_slice_ext=", rseq_slice_cmdline);
#endif /* CONFIG_RSEQ_SLICE_EXTENSION */
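/*
 * Illustrative user-space flow for the slice extension facility, not part
 * of the kernel build. The top-level prctl() option constant is defined in
 * the prctl UAPI header and is only assumed here to be named
 * PR_RSEQ_SLICE_EXTENSION; arg2/arg3 correspond to the values handled by
 * rseq_slice_extension_prctl() above:
 *
 *	// Opt in after registering rseq; the kernel must have advertised
 *	// RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE in the rseq area flags.
 *	prctl(PR_RSEQ_SLICE_EXTENSION, PR_RSEQ_SLICE_EXTENSION_SET,
 *	      PR_RSEQ_SLICE_EXT_ENABLE, 0, 0);
 *
 *	// When a granted extension is no longer needed, give the CPU back:
 *	syscall(__NR_rseq_slice_yield);
 *
 * The whole facility can be disabled at boot with "rseq_slice_ext=0",
 * handled by rseq_slice_cmdline() above.
 */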