// SPDX-License-Identifier: GPL-2.0+
/*
 * Restartable sequences system call
 *
 * Copyright (C) 2015, Google, Inc.,
 * Paul Turner <pjt@google.com> and Andrew Hunter <ahh@google.com>
 * Copyright (C) 2015-2018, EfficiOS Inc.,
 * Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
 */

#include <linux/sched.h>
#include <linux/uaccess.h>
#include <linux/syscalls.h>
#include <linux/rseq.h>
#include <linux/types.h>
#include <linux/ratelimit.h>
#include <asm/ptrace.h>

#define CREATE_TRACE_POINTS
#include <trace/events/rseq.h>

/* The original rseq structure size (including padding) is 32 bytes. */
#define ORIG_RSEQ_SIZE		32

#define RSEQ_CS_NO_RESTART_FLAGS (RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT | \
				  RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL | \
				  RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE)

#ifdef CONFIG_DEBUG_RSEQ
static struct rseq *rseq_kernel_fields(struct task_struct *t)
{
	return (struct rseq *) t->rseq_fields;
}

static int rseq_validate_ro_fields(struct task_struct *t)
{
	static DEFINE_RATELIMIT_STATE(_rs,
				      DEFAULT_RATELIMIT_INTERVAL,
				      DEFAULT_RATELIMIT_BURST);
	u32 cpu_id_start, cpu_id, node_id, mm_cid;
	struct rseq __user *rseq = t->rseq;

	/*
	 * Validate fields which are required to be read-only by
	 * user-space.
	 */
	if (!user_read_access_begin(rseq, t->rseq_len))
		goto efault;
	unsafe_get_user(cpu_id_start, &rseq->cpu_id_start, efault_end);
	unsafe_get_user(cpu_id, &rseq->cpu_id, efault_end);
	unsafe_get_user(node_id, &rseq->node_id, efault_end);
	unsafe_get_user(mm_cid, &rseq->mm_cid, efault_end);
	user_read_access_end();

	if ((cpu_id_start != rseq_kernel_fields(t)->cpu_id_start ||
	     cpu_id != rseq_kernel_fields(t)->cpu_id ||
	     node_id != rseq_kernel_fields(t)->node_id ||
	     mm_cid != rseq_kernel_fields(t)->mm_cid) && __ratelimit(&_rs)) {

		pr_warn("Detected rseq corruption for pid: %d, name: %s\n"
			"\tcpu_id_start: %u ?= %u\n"
			"\tcpu_id: %u ?= %u\n"
			"\tnode_id: %u ?= %u\n"
			"\tmm_cid: %u ?= %u\n",
			t->pid, t->comm,
			cpu_id_start, rseq_kernel_fields(t)->cpu_id_start,
			cpu_id, rseq_kernel_fields(t)->cpu_id,
			node_id, rseq_kernel_fields(t)->node_id,
			mm_cid, rseq_kernel_fields(t)->mm_cid);
	}

	/* For now, only print a console warning on mismatch. */
	return 0;

efault_end:
	user_read_access_end();
efault:
	return -EFAULT;
}

static void rseq_set_ro_fields(struct task_struct *t, u32 cpu_id_start, u32 cpu_id,
			       u32 node_id, u32 mm_cid)
{
	rseq_kernel_fields(t)->cpu_id_start = cpu_id;
	rseq_kernel_fields(t)->cpu_id = cpu_id;
	rseq_kernel_fields(t)->node_id = node_id;
	rseq_kernel_fields(t)->mm_cid = mm_cid;
}
#else
static int rseq_validate_ro_fields(struct task_struct *t)
{
	return 0;
}

static void rseq_set_ro_fields(struct task_struct *t, u32 cpu_id_start, u32 cpu_id,
			       u32 node_id, u32 mm_cid)
{
}
#endif
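
/*
 * Example (not kernel code): from user-space, the fields validated above are
 * read-only. A minimal sketch, assuming a glibc (>= 2.35) that registers rseq
 * and exports __rseq_offset/__rseq_size through <sys/rseq.h>, and a compiler
 * providing __builtin_thread_pointer():
 *
 *	#include <sys/rseq.h>
 *
 *	static inline struct rseq *rseq_self(void)
 *	{
 *		return (struct rseq *)((char *)__builtin_thread_pointer() +
 *				       __rseq_offset);
 *	}
 *
 *	// Read-only from user-space: load, but never store to, these fields.
 *	unsigned int cur_cpu = rseq_self()->cpu_id;
 *	unsigned int cur_mm_cid = rseq_self()->mm_cid;
 *
 * Storing to these fields from user-space is a bug; with CONFIG_DEBUG_RSEQ,
 * the in-kernel shadow copy above makes such corruption visible as a
 * rate-limited console warning.
 */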

/*
 *
 * Restartable sequences are a lightweight interface that allows
 * user-level code to be executed atomically relative to scheduler
 * preemption and signal delivery. Typically used for implementing
 * per-cpu operations.
 *
 * It allows user-space to perform update operations on per-cpu data
 * without requiring heavy-weight atomic operations.
 *
 * Detailed algorithm of rseq user-space assembly sequences:
 *
 *                     init(rseq_cs)
 *                     cpu = TLS->rseq::cpu_id_start
 *   [1]               TLS->rseq::rseq_cs = rseq_cs
 *   [start_ip]        ----------------------------
 *   [2]               if (cpu != TLS->rseq::cpu_id)
 *                             goto abort_ip;
 *   [3]               <last_instruction_in_cs>
 *   [post_commit_ip]  ----------------------------
 *
 *   The address of jump target abort_ip must be outside the critical
 *   region, i.e.:
 *
 *     [abort_ip] < [start_ip]  || [abort_ip] >= [post_commit_ip]
 *
 *   Steps [2]-[3] (inclusive) need to be a sequence of instructions in
 *   userspace that can handle being interrupted between any of those
 *   instructions, and then resumed to the abort_ip.
 *
 *   1.  Userspace stores the address of the struct rseq_cs assembly
 *       block descriptor into the rseq_cs field of the registered
 *       struct rseq TLS area. This update is performed through a single
 *       store within the inline assembly instruction sequence.
 *       [start_ip]
 *
 *   2.  Userspace tests to check whether the current cpu_id field matches
 *       the cpu number loaded before start_ip, branching to abort_ip
 *       in case of a mismatch.
 *
 *       If the sequence is preempted or interrupted by a signal
 *       at or after start_ip and before post_commit_ip, then the kernel
 *       clears TLS->__rseq_abi::rseq_cs, and sets the user-space return
 *       ip to abort_ip before returning to user-space, so the preempted
 *       execution resumes at abort_ip.
 *
 *   3.  Userspace critical section final instruction before
 *       post_commit_ip is the commit. The critical section is
 *       self-terminating.
 *       [post_commit_ip]
 *
 *   4.  <success>
 *
 *   On failure at [2], or if interrupted by preempt or signal delivery
 *   between [1] and [3]:
 *
 *       [abort_ip]
 *   F1. <failure>
 */

static int rseq_update_cpu_node_id(struct task_struct *t)
{
	struct rseq __user *rseq = t->rseq;
	u32 cpu_id = raw_smp_processor_id();
	u32 node_id = cpu_to_node(cpu_id);
	u32 mm_cid = task_mm_cid(t);

	/*
	 * Validate read-only rseq fields.
	 */
	if (rseq_validate_ro_fields(t))
		goto efault;
	WARN_ON_ONCE((int) mm_cid < 0);
	if (!user_write_access_begin(rseq, t->rseq_len))
		goto efault;
	unsafe_put_user(cpu_id, &rseq->cpu_id_start, efault_end);
	unsafe_put_user(cpu_id, &rseq->cpu_id, efault_end);
	unsafe_put_user(node_id, &rseq->node_id, efault_end);
	unsafe_put_user(mm_cid, &rseq->mm_cid, efault_end);
	/*
	 * Additional feature fields added after ORIG_RSEQ_SIZE
	 * need to be conditionally updated only if
	 * t->rseq_len != ORIG_RSEQ_SIZE.
	 */
	user_write_access_end();
	rseq_set_ro_fields(t, cpu_id, cpu_id, node_id, mm_cid);
	trace_rseq_update(t);
	return 0;

efault_end:
	user_write_access_end();
efault:
	return -EFAULT;
}
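
/*
 * Example (not kernel code): a concrete instance of the algorithm described
 * in the comment block above. This is a minimal, hedged sketch for x86-64
 * only; it assumes a hypothetical rseq_self() helper returning the thread's
 * registered struct rseq, and that the signature passed to sys_rseq() at
 * registration was 0x53053053 (the value used by librseq on x86):
 *
 *	static int rseq_percpu_inc(unsigned long *percpu_count, int cpu)
 *	{
 *		__asm__ __volatile__ goto (
 *		".pushsection __rseq_cs, \"aw\"\n\t"
 *		".balign 32\n\t"
 *		"1:\n\t"
 *		".long 0x0, 0x0\n\t"		// version = 0, flags = 0
 *		".quad 2f, 3f - 2f, 4f\n\t"	// start_ip, post_commit_offset, abort_ip
 *		".popsection\n\t"
 *		"leaq 1b(%%rip), %%rax\n\t"
 *		"movq %%rax, %[rseq_cs]\n\t"	// [1] publish the rseq_cs descriptor
 *		"2:\n\t"			// [start_ip]
 *		"cmpl %[cpu], %[cur_cpu]\n\t"	// [2] still on the expected cpu?
 *		"jnz 4f\n\t"
 *		"incq (%[count])\n\t"		// [3] single-instruction commit
 *		"3:\n\t"			// [post_commit_ip]
 *		".pushsection __rseq_failure, \"ax\"\n\t"
 *		".long 0x53053053\n\t"		// signature checked by the kernel
 *		"4:\n\t"			// [abort_ip]
 *		"jmp %l[abort]\n\t"
 *		".popsection\n\t"
 *		: : [cpu]     "r" (cpu),
 *		    [cur_cpu] "m" (rseq_self()->cpu_id),
 *		    [rseq_cs] "m" (rseq_self()->rseq_cs),
 *		    [count]   "r" (percpu_count + cpu)
 *		: "rax", "memory", "cc"
 *		: abort);
 *		return 0;
 *	abort:
 *		return -1;	// aborted: caller re-reads the cpu and retries
 *	}
 */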

static int rseq_reset_rseq_cpu_node_id(struct task_struct *t)
{
	u32 cpu_id_start = 0, cpu_id = RSEQ_CPU_ID_UNINITIALIZED, node_id = 0,
	    mm_cid = 0;

	/*
	 * Validate read-only rseq fields.
	 */
	if (rseq_validate_ro_fields(t))
		return -EFAULT;
	/*
	 * Reset cpu_id_start to its initial state (0).
	 */
	if (put_user(cpu_id_start, &t->rseq->cpu_id_start))
		return -EFAULT;
	/*
	 * Reset cpu_id to RSEQ_CPU_ID_UNINITIALIZED, so any user coming
	 * in after unregistration can figure out that rseq needs to be
	 * registered again.
	 */
	if (put_user(cpu_id, &t->rseq->cpu_id))
		return -EFAULT;
	/*
	 * Reset node_id to its initial state (0).
	 */
	if (put_user(node_id, &t->rseq->node_id))
		return -EFAULT;
	/*
	 * Reset mm_cid to its initial state (0).
	 */
	if (put_user(mm_cid, &t->rseq->mm_cid))
		return -EFAULT;

	rseq_set_ro_fields(t, cpu_id_start, cpu_id, node_id, mm_cid);

	/*
	 * Additional feature fields added after ORIG_RSEQ_SIZE
	 * need to be conditionally reset only if
	 * t->rseq_len != ORIG_RSEQ_SIZE.
	 */
	return 0;
}

static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs)
{
	struct rseq_cs __user *urseq_cs;
	u64 ptr;
	u32 __user *usig;
	u32 sig;
	int ret;

#ifdef CONFIG_64BIT
	if (get_user(ptr, &t->rseq->rseq_cs))
		return -EFAULT;
#else
	if (copy_from_user(&ptr, &t->rseq->rseq_cs, sizeof(ptr)))
		return -EFAULT;
#endif
	if (!ptr) {
		memset(rseq_cs, 0, sizeof(*rseq_cs));
		return 0;
	}
	if (ptr >= TASK_SIZE)
		return -EINVAL;
	urseq_cs = (struct rseq_cs __user *)(unsigned long)ptr;
	if (copy_from_user(rseq_cs, urseq_cs, sizeof(*rseq_cs)))
		return -EFAULT;

	if (rseq_cs->start_ip >= TASK_SIZE ||
	    rseq_cs->start_ip + rseq_cs->post_commit_offset >= TASK_SIZE ||
	    rseq_cs->abort_ip >= TASK_SIZE ||
	    rseq_cs->version > 0)
		return -EINVAL;
	/* Check for overflow. */
	if (rseq_cs->start_ip + rseq_cs->post_commit_offset < rseq_cs->start_ip)
		return -EINVAL;
	/* Ensure that abort_ip is not in the critical section. */
	if (rseq_cs->abort_ip - rseq_cs->start_ip < rseq_cs->post_commit_offset)
		return -EINVAL;

	usig = (u32 __user *)(unsigned long)(rseq_cs->abort_ip - sizeof(u32));
	ret = get_user(sig, usig);
	if (ret)
		return ret;

	if (current->rseq_sig != sig) {
		printk_ratelimited(KERN_WARNING
			"Possible attack attempt. Unexpected rseq signature 0x%x, expecting 0x%x (pid=%d, addr=%p).\n",
			sig, current->rseq_sig, current->pid, usig);
		return -EINVAL;
	}
	return 0;
}
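
/*
 * Illustration (hypothetical addresses) of the checks performed above: a
 * user-space descriptor such as
 *
 *	struct rseq_cs cs = {
 *		.version            = 0,
 *		.flags              = 0,
 *		.start_ip           = 0x401000,	// [start_ip]
 *		.post_commit_offset = 0x20,	// section is [0x401000, 0x401020)
 *		.abort_ip           = 0x401040,	// [abort_ip], outside the section
 *	};
 *
 * passes the validation: abort_ip - start_ip (0x40) is not smaller than
 * post_commit_offset (0x20), so abort_ip lies outside the critical section,
 * and the u32 located at abort_ip - 4 (0x40103c) must hold the signature
 * passed to sys_rseq() at registration, otherwise the descriptor is rejected
 * with -EINVAL.
 */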

static bool rseq_warn_flags(const char *str, u32 flags)
{
	u32 test_flags;

	if (!flags)
		return false;
	test_flags = flags & RSEQ_CS_NO_RESTART_FLAGS;
	if (test_flags)
		pr_warn_once("Deprecated flags (%u) in %s ABI structure", test_flags, str);
	test_flags = flags & ~RSEQ_CS_NO_RESTART_FLAGS;
	if (test_flags)
		pr_warn_once("Unknown flags (%u) in %s ABI structure", test_flags, str);
	return true;
}

static int rseq_need_restart(struct task_struct *t, u32 cs_flags)
{
	u32 flags, event_mask;
	int ret;

	if (rseq_warn_flags("rseq_cs", cs_flags))
		return -EINVAL;

	/* Get thread flags. */
	ret = get_user(flags, &t->rseq->flags);
	if (ret)
		return ret;

	if (rseq_warn_flags("rseq", flags))
		return -EINVAL;

	/*
	 * Load and clear event mask atomically with respect to
	 * scheduler preemption.
	 */
	preempt_disable();
	event_mask = t->rseq_event_mask;
	t->rseq_event_mask = 0;
	preempt_enable();

	return !!event_mask;
}

static int clear_rseq_cs(struct task_struct *t)
{
	/*
	 * The rseq_cs field is set to NULL on preemption or signal
	 * delivery on top of rseq assembly block, as well as on top
	 * of code outside of the rseq assembly block. This performs
	 * a lazy clear of the rseq_cs field.
	 *
	 * Set rseq_cs to NULL.
	 */
#ifdef CONFIG_64BIT
	return put_user(0UL, &t->rseq->rseq_cs);
#else
	if (clear_user(&t->rseq->rseq_cs, sizeof(t->rseq->rseq_cs)))
		return -EFAULT;
	return 0;
#endif
}

/*
 * Unsigned comparison will be true when ip >= start_ip, and when
 * ip < start_ip + post_commit_offset.
 */
static bool in_rseq_cs(unsigned long ip, struct rseq_cs *rseq_cs)
{
	return ip - rseq_cs->start_ip < rseq_cs->post_commit_offset;
}

static int rseq_ip_fixup(struct pt_regs *regs)
{
	unsigned long ip = instruction_pointer(regs);
	struct task_struct *t = current;
	struct rseq_cs rseq_cs;
	int ret;

	ret = rseq_get_rseq_cs(t, &rseq_cs);
	if (ret)
		return ret;

	/*
	 * Handle potentially not being within a critical section.
	 * If not nested over a rseq critical section, restart is useless.
	 * Clear the rseq_cs pointer and return.
	 */
	if (!in_rseq_cs(ip, &rseq_cs))
		return clear_rseq_cs(t);
	ret = rseq_need_restart(t, rseq_cs.flags);
	if (ret <= 0)
		return ret;
	ret = clear_rseq_cs(t);
	if (ret)
		return ret;
	trace_rseq_ip_fixup(ip, rseq_cs.start_ip, rseq_cs.post_commit_offset,
			    rseq_cs.abort_ip);
	instruction_pointer_set(regs, (unsigned long)rseq_cs.abort_ip);
	return 0;
}

/*
 * This resume handler must always be executed between any of:
 * - preemption,
 * - signal delivery,
 * and return to user-space.
 *
 * This is how we can ensure that the entire rseq critical section
 * will issue the commit instruction only if executed atomically with
 * respect to other threads scheduled on the same CPU, and with respect
 * to signal handlers.
 */
void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs)
{
	struct task_struct *t = current;
	int ret, sig;

	if (unlikely(t->flags & PF_EXITING))
		return;

	/*
	 * regs is NULL if and only if the caller is in a syscall path. Skip
	 * fixup and leave rseq_cs as is so that rseq_syscall() will detect and
	 * kill a misbehaving userspace on debug kernels.
	 */
	if (regs) {
		ret = rseq_ip_fixup(regs);
		if (unlikely(ret < 0))
			goto error;
	}
	if (unlikely(rseq_update_cpu_node_id(t)))
		goto error;
	return;

error:
	sig = ksig ? ksig->sig : 0;
	force_sigsegv(sig);
}

#ifdef CONFIG_DEBUG_RSEQ

/*
 * Terminate the process if a syscall is issued within a restartable
 * sequence.
 */
void rseq_syscall(struct pt_regs *regs)
{
	unsigned long ip = instruction_pointer(regs);
	struct task_struct *t = current;
	struct rseq_cs rseq_cs;

	if (!t->rseq)
		return;
	if (rseq_get_rseq_cs(t, &rseq_cs) || in_rseq_cs(ip, &rseq_cs))
		force_sig(SIGSEGV);
}

#endif
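
/*
 * Example (not kernel code): what the fixup above means for user-space. A
 * caller of the hypothetical rseq_percpu_inc() sketch shown earlier must
 * treat an abort as "retry from scratch", re-reading the cpu number each
 * time, and must not issue system calls between start_ip and post_commit_ip
 * (rseq_syscall() above turns that into SIGSEGV on CONFIG_DEBUG_RSEQ
 * kernels):
 *
 *	static void percpu_inc(unsigned long *percpu_count)
 *	{
 *		int cpu;
 *
 *		do {
 *			cpu = rseq_self()->cpu_id_start;	// snapshot before start_ip
 *		} while (rseq_percpu_inc(percpu_count, cpu));	// aborted: retry
 *	}
 */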

/*
 * sys_rseq - setup restartable sequences for caller thread.
 */
SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len,
		int, flags, u32, sig)
{
	int ret;

	if (flags & RSEQ_FLAG_UNREGISTER) {
		if (flags & ~RSEQ_FLAG_UNREGISTER)
			return -EINVAL;
		/* Unregister rseq for current thread. */
		if (current->rseq != rseq || !current->rseq)
			return -EINVAL;
		if (rseq_len != current->rseq_len)
			return -EINVAL;
		if (current->rseq_sig != sig)
			return -EPERM;
		ret = rseq_reset_rseq_cpu_node_id(current);
		if (ret)
			return ret;
		current->rseq = NULL;
		current->rseq_sig = 0;
		current->rseq_len = 0;
		return 0;
	}

	if (unlikely(flags))
		return -EINVAL;

	if (current->rseq) {
		/*
		 * If rseq is already registered, check whether
		 * the provided address differs from the prior
		 * one.
		 */
		if (current->rseq != rseq || rseq_len != current->rseq_len)
			return -EINVAL;
		if (current->rseq_sig != sig)
			return -EPERM;
		/* Already registered. */
		return -EBUSY;
	}

	/*
	 * If there was no rseq previously registered, ensure the provided rseq
	 * is properly aligned, as communicated to user-space through the ELF
	 * auxiliary vector AT_RSEQ_ALIGN. If rseq_len is the original rseq
	 * size, the required alignment is the original struct rseq alignment.
	 *
	 * In order to be valid, rseq_len is either the original rseq size, or
	 * large enough to contain all supported fields, as communicated to
	 * user-space through the ELF auxiliary vector AT_RSEQ_FEATURE_SIZE.
	 */
	if (rseq_len < ORIG_RSEQ_SIZE ||
	    (rseq_len == ORIG_RSEQ_SIZE && !IS_ALIGNED((unsigned long)rseq, ORIG_RSEQ_SIZE)) ||
	    (rseq_len != ORIG_RSEQ_SIZE && (!IS_ALIGNED((unsigned long)rseq, __alignof__(*rseq)) ||
					    rseq_len < offsetof(struct rseq, end))))
		return -EINVAL;
	if (!access_ok(rseq, rseq_len))
		return -EFAULT;
	current->rseq = rseq;
	current->rseq_len = rseq_len;
	current->rseq_sig = sig;
#ifdef CONFIG_DEBUG_RSEQ
	/*
	 * Initialize the in-kernel rseq fields copy for validation of
	 * read-only fields.
	 */
	if (get_user(rseq_kernel_fields(current)->cpu_id_start, &rseq->cpu_id_start) ||
	    get_user(rseq_kernel_fields(current)->cpu_id, &rseq->cpu_id) ||
	    get_user(rseq_kernel_fields(current)->node_id, &rseq->node_id) ||
	    get_user(rseq_kernel_fields(current)->mm_cid, &rseq->mm_cid))
		return -EFAULT;
#endif
	/*
	 * If rseq was previously inactive, and has just been
	 * registered, ensure the cpu_id_start and cpu_id fields
	 * are updated before returning to user-space.
	 */
	rseq_set_notify_resume(current);

	return 0;
}
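
/*
 * Example (not kernel code): registering and unregistering rseq for the
 * current thread. A minimal sketch, assuming __NR_rseq is available from
 * <sys/syscall.h>, the UAPI definitions from <linux/rseq.h>, and that no
 * other component (e.g. glibc >= 2.35) has already registered rseq for the
 * thread; a second registration attempt fails with -EINVAL, -EPERM or
 * -EBUSY depending on the arguments, as implemented above:
 *
 *	#include <linux/rseq.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	#define MY_RSEQ_SIG	0x53053053	// must match the abort signature
 *
 *	static __thread struct rseq rseq_area;	// aligned(32) by the UAPI type
 *
 *	static int my_rseq_register(void)
 *	{
 *		return syscall(__NR_rseq, &rseq_area, sizeof(rseq_area),
 *			       0, MY_RSEQ_SIG);
 *	}
 *
 *	static int my_rseq_unregister(void)
 *	{
 *		return syscall(__NR_rseq, &rseq_area, sizeof(rseq_area),
 *			       RSEQ_FLAG_UNREGISTER, MY_RSEQ_SIG);
 *	}
 *
 * After a successful registration, rseq_area.cpu_id is updated by the kernel
 * before returning to user-space (see rseq_set_notify_resume() above); after
 * unregistration it reads RSEQ_CPU_ID_UNINITIALIZED.
 */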