// SPDX-License-Identifier: GPL-2.0+
/*
 * Restartable sequences system call
 *
 * Copyright (C) 2015, Google, Inc.,
 * Paul Turner <pjt@google.com> and Andrew Hunter <ahh@google.com>
 * Copyright (C) 2015-2018, EfficiOS Inc.,
 * Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
 */

/*
 * Restartable sequences are a lightweight interface that allows
 * user-level code to be executed atomically relative to scheduler
 * preemption and signal delivery. Typically used for implementing
 * per-cpu operations.
 *
 * It allows user-space to perform update operations on per-cpu data
 * without requiring heavy-weight atomic operations.
 *
 * Detailed algorithm of rseq user-space assembly sequences:
 *
 *                     init(rseq_cs)
 *                     cpu = TLS->rseq::cpu_id_start
 *   [1]               TLS->rseq::rseq_cs = rseq_cs
 *   [start_ip]        ----------------------------
 *   [2]               if (cpu != TLS->rseq::cpu_id)
 *                             goto abort_ip;
 *   [3]               <last_instruction_in_cs>
 *   [post_commit_ip]  ----------------------------
 *
 *   The address of jump target abort_ip must be outside the critical
 *   region, i.e.:
 *
 *     [abort_ip] < [start_ip]  || [abort_ip] >= [post_commit_ip]
 *
 *   Steps [2]-[3] (inclusive) need to be a sequence of instructions in
 *   userspace that can handle being interrupted between any of those
 *   instructions, and then resumed to the abort_ip.
 *
 *   1.  Userspace stores the address of the struct rseq_cs assembly
 *       block descriptor into the rseq_cs field of the registered
 *       struct rseq TLS area. This update is performed through a single
 *       store within the inline assembly instruction sequence.
 *       [start_ip]
 *
 *   2.  Userspace tests to check whether the current cpu_id field matches
 *       the cpu number loaded before start_ip, branching to abort_ip
 *       in case of a mismatch.
 *
 *       If the sequence is preempted or interrupted by a signal
 *       at or after start_ip and before post_commit_ip, then the kernel
 *       clears TLS->__rseq_abi::rseq_cs, and sets the user-space return
 *       ip to abort_ip before returning to user-space, so the preempted
 *       execution resumes at abort_ip.
 *
 *   3.  Userspace critical section final instruction before
 *       post_commit_ip is the commit. The critical section is
 *       self-terminating.
 *       [post_commit_ip]
 *
 *   4.  <success>
 *
 *   On failure at [2], or if interrupted by preempt or signal delivery
 *   between [1] and [3]:
 *
 *       [abort_ip]
 *   F1. <failure>
 */
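
/*
 * Illustrative user-space sketch of the above, incrementing a per-CPU
 * counter. This is not kernel code: the descriptor and label names are
 * made up, and real implementations emit the sequence as inline assembly
 * so that start_ip, post_commit_ip and abort_ip are fixed instruction
 * addresses. The struct rseq_cs field names are those of the UAPI
 * descriptor:
 *
 *	static struct rseq_cs cs_desc = {
 *		.start_ip		= (__u64)start_ip,
 *		.post_commit_offset	= (__u64)(post_commit_ip - start_ip),
 *		.abort_ip		= (__u64)abort_ip,
 *	};
 *
 *			cpu = TLS->rseq::cpu_id_start
 *	[1]		TLS->rseq::rseq_cs = &cs_desc
 *	[start_ip]	if (cpu != TLS->rseq::cpu_id)		// [2]
 *				goto abort_ip;
 *			counter[cpu]++				// [3] commit
 *	[post_commit_ip]
 *			<success>
 *
 *	[abort_ip]	<retry with the updated cpu number>	// F1
 */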

/* Required to select the proper per_cpu ops for rseq_stats_inc() */
#define RSEQ_BUILD_SLOW_PATH

#include <linux/debugfs.h>
#include <linux/ratelimit.h>
#include <linux/rseq_entry.h>
#include <linux/sched.h>
#include <linux/syscalls.h>
#include <linux/uaccess.h>
#include <linux/types.h>
#include <asm/ptrace.h>

#define CREATE_TRACE_POINTS
#include <trace/events/rseq.h>

DEFINE_STATIC_KEY_MAYBE(CONFIG_RSEQ_DEBUG_DEFAULT_ENABLE, rseq_debug_enabled);

static inline void rseq_control_debug(bool on)
{
	if (on)
		static_branch_enable(&rseq_debug_enabled);
	else
		static_branch_disable(&rseq_debug_enabled);
}

static int __init rseq_setup_debug(char *str)
{
	bool on;

	if (kstrtobool(str, &on))
		return -EINVAL;
	rseq_control_debug(on);
	return 1;
}
__setup("rseq_debug=", rseq_setup_debug);

#ifdef CONFIG_TRACEPOINTS
/*
 * Out of line, so the actual update functions can be in a header to be
 * inlined into the exit to user code.
 */
void __rseq_trace_update(struct task_struct *t)
{
	trace_rseq_update(t);
}

void __rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip,
			   unsigned long offset, unsigned long abort_ip)
{
	trace_rseq_ip_fixup(ip, start_ip, offset, abort_ip);
}
#endif /* CONFIG_TRACEPOINTS */

#ifdef CONFIG_DEBUG_FS
#ifdef CONFIG_RSEQ_STATS
DEFINE_PER_CPU(struct rseq_stats, rseq_stats);

static int rseq_stats_show(struct seq_file *m, void *p)
{
	struct rseq_stats stats = { };
	unsigned int cpu;

	for_each_possible_cpu(cpu) {
		stats.exit	+= data_race(per_cpu(rseq_stats.exit, cpu));
		stats.signal	+= data_race(per_cpu(rseq_stats.signal, cpu));
		stats.slowpath	+= data_race(per_cpu(rseq_stats.slowpath, cpu));
		stats.fastpath	+= data_race(per_cpu(rseq_stats.fastpath, cpu));
		stats.ids	+= data_race(per_cpu(rseq_stats.ids, cpu));
		stats.cs	+= data_race(per_cpu(rseq_stats.cs, cpu));
		stats.clear	+= data_race(per_cpu(rseq_stats.clear, cpu));
		stats.fixup	+= data_race(per_cpu(rseq_stats.fixup, cpu));
	}

	seq_printf(m, "exit:   %16lu\n", stats.exit);
	seq_printf(m, "signal: %16lu\n", stats.signal);
	seq_printf(m, "slowp:  %16lu\n", stats.slowpath);
	seq_printf(m, "fastp:  %16lu\n", stats.fastpath);
	seq_printf(m, "ids:    %16lu\n", stats.ids);
	seq_printf(m, "cs:     %16lu\n", stats.cs);
	seq_printf(m, "clear:  %16lu\n", stats.clear);
	seq_printf(m, "fixup:  %16lu\n", stats.fixup);
	return 0;
}

static int rseq_stats_open(struct inode *inode, struct file *file)
{
	return single_open(file, rseq_stats_show, inode->i_private);
}

static const struct file_operations stat_ops = {
	.open		= rseq_stats_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};

static int __init rseq_stats_init(struct dentry *root_dir)
{
	debugfs_create_file("stats", 0444, root_dir, NULL, &stat_ops);
	return 0;
}
#else
static inline void rseq_stats_init(struct dentry *root_dir) { }
#endif /* CONFIG_RSEQ_STATS */

static int rseq_debug_show(struct seq_file *m, void *p)
{
	bool on = static_branch_unlikely(&rseq_debug_enabled);

	seq_printf(m, "%d\n", on);
	return 0;
}

static ssize_t rseq_debug_write(struct file *file, const char __user *ubuf,
				size_t count, loff_t *ppos)
{
	bool on;

	if (kstrtobool_from_user(ubuf, count, &on))
		return -EINVAL;

	rseq_control_debug(on);
	return count;
}

static int rseq_debug_open(struct inode *inode, struct file *file)
{
	return single_open(file, rseq_debug_show, inode->i_private);
}

static const struct file_operations debug_ops = {
	.open		= rseq_debug_open,
	.read		= seq_read,
	.write		= rseq_debug_write,
	.llseek		= seq_lseek,
	.release	= single_release,
};

static int __init rseq_debugfs_init(void)
{
	struct dentry *root_dir = debugfs_create_dir("rseq", NULL);

	debugfs_create_file("debug", 0644, root_dir, NULL, &debug_ops);
	rseq_stats_init(root_dir);
	return 0;
}
__initcall(rseq_debugfs_init);
#endif /* CONFIG_DEBUG_FS */

static bool rseq_set_ids(struct task_struct *t, struct rseq_ids *ids, u32 node_id)
{
	return rseq_set_ids_get_csaddr(t, ids, node_id, NULL);
}
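
/*
 * The rseq_cs value read below is a user-space pointer to a struct
 * rseq_cs critical section descriptor as defined in
 * include/uapi/linux/rseq.h, roughly:
 *
 *	struct rseq_cs {
 *		__u32 version;
 *		__u32 flags;
 *		__u64 start_ip;
 *		__u64 post_commit_offset;
 *		__u64 abort_ip;
 *	};
 *
 * A non-zero pointer means the task may have been interrupted inside
 * [start_ip, start_ip + post_commit_offset); rseq_update_user_cs()
 * clears the pointer and, if the interrupted IP is within that range,
 * redirects the user-space return IP to abort_ip.
 */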

static bool rseq_handle_cs(struct task_struct *t, struct pt_regs *regs)
{
	struct rseq __user *urseq = t->rseq.usrptr;
	u64 csaddr;

	scoped_user_read_access(urseq, efault)
		unsafe_get_user(csaddr, &urseq->rseq_cs, efault);
	if (likely(!csaddr))
		return true;
	return rseq_update_user_cs(t, regs, csaddr);
efault:
	return false;
}

static void rseq_slowpath_update_usr(struct pt_regs *regs)
{
	/*
	 * Preserve rseq state and user_irq state. The generic entry code
	 * clears user_irq on the way out, the non-generic entry
	 * architectures do not have user_irq.
	 */
	const struct rseq_event evt_mask = { .has_rseq = true, .user_irq = true, };
	struct task_struct *t = current;
	struct rseq_ids ids;
	u32 node_id;
	bool event;

	if (unlikely(t->flags & PF_EXITING))
		return;

	rseq_stat_inc(rseq_stats.slowpath);

	/*
	 * Read and clear the event pending bit first. If the task
	 * was not preempted or migrated and no signal is on the way,
	 * there is no point in doing any of the heavy lifting here
	 * on production kernels. In that case TIF_NOTIFY_RESUME
	 * was raised by some other functionality.
	 *
	 * This is correct because the read/clear operation is
	 * guarded against scheduler preemption, which makes it CPU
	 * local atomic. If the task is preempted right after
	 * re-enabling preemption then TIF_NOTIFY_RESUME is set
	 * again and this function is invoked another time _before_
	 * the task is able to return to user mode.
	 *
	 * On a debug kernel, invoke the fixup code unconditionally
	 * with the result handed in to allow the detection of
	 * inconsistencies.
	 */
	scoped_guard(irq) {
		event = t->rseq.event.sched_switch;
		t->rseq.event.all &= evt_mask.all;
		ids.cpu_id = task_cpu(t);
		ids.mm_cid = task_mm_cid(t);
	}

	if (!event)
		return;

	node_id = cpu_to_node(ids.cpu_id);

	if (unlikely(!rseq_update_usr(t, regs, &ids, node_id))) {
		/*
		 * Clear the errors just in case this might survive magically, but
		 * leave the rest intact.
		 */
		t->rseq.event.error = 0;
		force_sig(SIGSEGV);
	}
}

void __rseq_handle_slowpath(struct pt_regs *regs)
{
	/*
	 * If invoked from hypervisors before entering the guest via
	 * resume_user_mode_work(), then @regs is a NULL pointer.
	 *
	 * resume_user_mode_work() clears TIF_NOTIFY_RESUME and re-raises
	 * it before returning from the ioctl() to user space when
	 * rseq_event.sched_switch is set.
	 *
	 * So it's safe to ignore here instead of pointlessly updating it
	 * in the vcpu_run() loop.
	 */
	if (!regs)
		return;

	rseq_slowpath_update_usr(regs);
}

void __rseq_signal_deliver(int sig, struct pt_regs *regs)
{
	rseq_stat_inc(rseq_stats.signal);
	/*
	 * Don't update IDs, they are handled on exit to user if
	 * necessary. The important thing is to abort a critical section of
	 * the interrupted context as after this point the instruction
	 * pointer in @regs points to the signal handler.
	 */
	if (unlikely(!rseq_handle_cs(current, regs))) {
		/*
		 * Clear the errors just in case this might survive
		 * magically, but leave the rest intact.
		 */
		current->rseq.event.error = 0;
		force_sigsegv(sig);
	}
}

/*
 * Terminate the process if a syscall is issued within a restartable
 * sequence.
 */
void __rseq_debug_syscall_return(struct pt_regs *regs)
{
	struct task_struct *t = current;
	u64 csaddr;

	if (!t->rseq.event.has_rseq)
		return;
	if (get_user(csaddr, &t->rseq.usrptr->rseq_cs))
		goto fail;
	if (likely(!csaddr))
		return;
	if (unlikely(csaddr >= TASK_SIZE))
		goto fail;
	if (rseq_debug_update_user_cs(t, regs, csaddr))
		return;
fail:
	force_sig(SIGSEGV);
}

#ifdef CONFIG_DEBUG_RSEQ
/* Kept around to keep GENERIC_ENTRY=n architectures supported. */
void rseq_syscall(struct pt_regs *regs)
{
	__rseq_debug_syscall_return(regs);
}
#endif
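
/*
 * Illustrative only: the check above catches user-space code which
 * (incorrectly) issues a system call between start_ip and
 * post_commit_ip, e.g.:
 *
 *	[1]		TLS->rseq::rseq_cs = &cs_desc
 *	[start_ip]	if (cpu != TLS->rseq::cpu_id)
 *				goto abort_ip;
 *			write(fd, buf, len)	<- not allowed inside the
 *						   critical section
 *			counter[cpu]++
 *	[post_commit_ip]
 *
 * With rseq debug checks enabled, returning from such a syscall while
 * the return IP is still inside the critical section terminates the
 * process instead of letting it continue with undefined behaviour.
 */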

static bool rseq_reset_ids(void)
{
	struct rseq_ids ids = {
		.cpu_id	= RSEQ_CPU_ID_UNINITIALIZED,
		.mm_cid	= 0,
	};

	/*
	 * If this fails, terminate the task because it leaves the kernel
	 * in an inconsistent state: the exit to user space path would try
	 * to fix up the IDs again.
	 */
	if (rseq_set_ids(current, &ids, 0))
		return true;

	force_sig(SIGSEGV);
	return false;
}

/* The original rseq structure size (including padding) is 32 bytes. */
#define ORIG_RSEQ_SIZE		32
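
/*
 * Minimal user-space registration sketch (illustrative; normally glibc
 * registers the area automatically, and RSEQ_SIG stands for an arbitrary
 * signature value which must match on unregister and must precede every
 * abort handler in the user code):
 *
 *	static __thread struct rseq rs __attribute__((aligned(32)));
 *
 *	syscall(__NR_rseq, &rs, sizeof(rs), 0, RSEQ_SIG);
 *	...
 *	syscall(__NR_rseq, &rs, sizeof(rs), RSEQ_FLAG_UNREGISTER, RSEQ_SIG);
 */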

/*
 * sys_rseq - set up restartable sequences for caller thread.
 */
SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32, sig)
{
	if (flags & RSEQ_FLAG_UNREGISTER) {
		if (flags & ~RSEQ_FLAG_UNREGISTER)
			return -EINVAL;
		/* Unregister rseq for current thread. */
		if (current->rseq.usrptr != rseq || !current->rseq.usrptr)
			return -EINVAL;
		if (rseq_len != current->rseq.len)
			return -EINVAL;
		if (current->rseq.sig != sig)
			return -EPERM;
		if (!rseq_reset_ids())
			return -EFAULT;
		rseq_reset(current);
		return 0;
	}

	if (unlikely(flags))
		return -EINVAL;

	if (current->rseq.usrptr) {
		/*
		 * If rseq is already registered, check whether
		 * the provided address differs from the prior
		 * one.
		 */
		if (current->rseq.usrptr != rseq || rseq_len != current->rseq.len)
			return -EINVAL;
		if (current->rseq.sig != sig)
			return -EPERM;
		/* Already registered. */
		return -EBUSY;
	}

	/*
	 * If there was no rseq previously registered, ensure the provided rseq
	 * is properly aligned, as communicated to user-space through the ELF
	 * auxiliary vector AT_RSEQ_ALIGN. If rseq_len is the original rseq
	 * size, the required alignment is the original struct rseq alignment.
	 *
	 * In order to be valid, rseq_len is either the original rseq size, or
	 * large enough to contain all supported fields, as communicated to
	 * user-space through the ELF auxiliary vector AT_RSEQ_FEATURE_SIZE.
	 */
	if (rseq_len < ORIG_RSEQ_SIZE ||
	    (rseq_len == ORIG_RSEQ_SIZE && !IS_ALIGNED((unsigned long)rseq, ORIG_RSEQ_SIZE)) ||
	    (rseq_len != ORIG_RSEQ_SIZE && (!IS_ALIGNED((unsigned long)rseq, __alignof__(*rseq)) ||
					    rseq_len < offsetof(struct rseq, end))))
		return -EINVAL;
	if (!access_ok(rseq, rseq_len))
		return -EFAULT;

	scoped_user_write_access(rseq, efault) {
		/*
		 * If the rseq_cs pointer is non-NULL on registration, clear it to
		 * avoid a potential segfault on return to user-space. The proper thing
		 * to do would have been to fail the registration but this would break
		 * older libcs that reuse the rseq area for new threads without
		 * clearing the fields. Don't bother reading it, just reset it.
		 */
		unsafe_put_user(0UL, &rseq->rseq_cs, efault);
		/* Initialize IDs in user space */
		unsafe_put_user(RSEQ_CPU_ID_UNINITIALIZED, &rseq->cpu_id_start, efault);
		unsafe_put_user(RSEQ_CPU_ID_UNINITIALIZED, &rseq->cpu_id, efault);
		unsafe_put_user(0U, &rseq->node_id, efault);
		unsafe_put_user(0U, &rseq->mm_cid, efault);
	}

	/*
	 * Activate the registration by setting the rseq area address, length
	 * and signature in the task struct.
	 */
	current->rseq.usrptr = rseq;
	current->rseq.len = rseq_len;
	current->rseq.sig = sig;

	/*
	 * If rseq was previously inactive, and has just been
	 * registered, ensure the cpu_id_start and cpu_id fields
	 * are updated before returning to user-space.
	 */
	current->rseq.event.has_rseq = true;
	rseq_force_update();
	return 0;

efault:
	return -EFAULT;
}