1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Copyright (C) 1995 Linus Torvalds 4 * 5 * Pentium III FXSR, SSE support 6 * Gareth Hughes <gareth@valinux.com>, May 2000 7 * 8 * X86-64 port 9 * Andi Kleen. 10 * 11 * CPU hotplug support - ashok.raj@intel.com 12 */ 13 14 /* 15 * This file handles the architecture-dependent parts of process handling.. 16 */ 17 18 #include <linux/cpu.h> 19 #include <linux/errno.h> 20 #include <linux/sched.h> 21 #include <linux/sched/task.h> 22 #include <linux/sched/task_stack.h> 23 #include <linux/fs.h> 24 #include <linux/kernel.h> 25 #include <linux/mm.h> 26 #include <linux/elfcore.h> 27 #include <linux/smp.h> 28 #include <linux/slab.h> 29 #include <linux/user.h> 30 #include <linux/interrupt.h> 31 #include <linux/delay.h> 32 #include <linux/export.h> 33 #include <linux/ptrace.h> 34 #include <linux/notifier.h> 35 #include <linux/kprobes.h> 36 #include <linux/kdebug.h> 37 #include <linux/prctl.h> 38 #include <linux/uaccess.h> 39 #include <linux/io.h> 40 #include <linux/ftrace.h> 41 #include <linux/syscalls.h> 42 #include <linux/iommu.h> 43 44 #include <asm/processor.h> 45 #include <asm/pkru.h> 46 #include <asm/fpu/sched.h> 47 #include <asm/mmu_context.h> 48 #include <asm/prctl.h> 49 #include <asm/desc.h> 50 #include <asm/proto.h> 51 #include <asm/ia32.h> 52 #include <asm/debugreg.h> 53 #include <asm/switch_to.h> 54 #include <asm/xen/hypervisor.h> 55 #include <asm/vdso.h> 56 #include <asm/resctrl.h> 57 #include <asm/unistd.h> 58 #include <asm/fsgsbase.h> 59 #ifdef CONFIG_IA32_EMULATION 60 /* Not included via unistd.h */ 61 #include <asm/unistd_32_ia32.h> 62 #endif 63 64 #include "process.h" 65 66 /* Prints also some state that isn't saved in the pt_regs */ 67 void __show_regs(struct pt_regs *regs, enum show_regs_mode mode, 68 const char *log_lvl) 69 { 70 unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs; 71 unsigned long d0, d1, d2, d3, d6, d7; 72 unsigned int fsindex, gsindex; 73 unsigned int ds, es; 74 75 show_iret_regs(regs, log_lvl); 76 77 if (regs->orig_ax != -1) 78 pr_cont(" ORIG_RAX: %016lx\n", regs->orig_ax); 79 else 80 pr_cont("\n"); 81 82 printk("%sRAX: %016lx RBX: %016lx RCX: %016lx\n", 83 log_lvl, regs->ax, regs->bx, regs->cx); 84 printk("%sRDX: %016lx RSI: %016lx RDI: %016lx\n", 85 log_lvl, regs->dx, regs->si, regs->di); 86 printk("%sRBP: %016lx R08: %016lx R09: %016lx\n", 87 log_lvl, regs->bp, regs->r8, regs->r9); 88 printk("%sR10: %016lx R11: %016lx R12: %016lx\n", 89 log_lvl, regs->r10, regs->r11, regs->r12); 90 printk("%sR13: %016lx R14: %016lx R15: %016lx\n", 91 log_lvl, regs->r13, regs->r14, regs->r15); 92 93 if (mode == SHOW_REGS_SHORT) 94 return; 95 96 if (mode == SHOW_REGS_USER) { 97 rdmsrl(MSR_FS_BASE, fs); 98 rdmsrl(MSR_KERNEL_GS_BASE, shadowgs); 99 printk("%sFS: %016lx GS: %016lx\n", 100 log_lvl, fs, shadowgs); 101 return; 102 } 103 104 asm("movl %%ds,%0" : "=r" (ds)); 105 asm("movl %%es,%0" : "=r" (es)); 106 asm("movl %%fs,%0" : "=r" (fsindex)); 107 asm("movl %%gs,%0" : "=r" (gsindex)); 108 109 rdmsrl(MSR_FS_BASE, fs); 110 rdmsrl(MSR_GS_BASE, gs); 111 rdmsrl(MSR_KERNEL_GS_BASE, shadowgs); 112 113 cr0 = read_cr0(); 114 cr2 = read_cr2(); 115 cr3 = __read_cr3(); 116 cr4 = __read_cr4(); 117 118 printk("%sFS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", 119 log_lvl, fs, fsindex, gs, gsindex, shadowgs); 120 printk("%sCS: %04lx DS: %04x ES: %04x CR0: %016lx\n", 121 log_lvl, regs->cs, ds, es, cr0); 122 printk("%sCR2: %016lx CR3: %016lx CR4: %016lx\n", 123 log_lvl, cr2, cr3, cr4); 124 125 get_debugreg(d0, 0); 126 get_debugreg(d1, 1); 127 get_debugreg(d2, 2); 128 get_debugreg(d3, 3); 129 get_debugreg(d6, 6); 130 get_debugreg(d7, 7); 131 132 /* Only print out debug registers if they are in their non-default state. */ 133 if (!((d0 == 0) && (d1 == 0) && (d2 == 0) && (d3 == 0) && 134 (d6 == DR6_RESERVED) && (d7 == 0x400))) { 135 printk("%sDR0: %016lx DR1: %016lx DR2: %016lx\n", 136 log_lvl, d0, d1, d2); 137 printk("%sDR3: %016lx DR6: %016lx DR7: %016lx\n", 138 log_lvl, d3, d6, d7); 139 } 140 141 if (cpu_feature_enabled(X86_FEATURE_OSPKE)) 142 printk("%sPKRU: %08x\n", log_lvl, read_pkru()); 143 } 144 145 void release_thread(struct task_struct *dead_task) 146 { 147 WARN_ON(dead_task->mm); 148 } 149 150 enum which_selector { 151 FS, 152 GS 153 }; 154 155 /* 156 * Out of line to be protected from kprobes and tracing. If this would be 157 * traced or probed than any access to a per CPU variable happens with 158 * the wrong GS. 159 * 160 * It is not used on Xen paravirt. When paravirt support is needed, it 161 * needs to be renamed with native_ prefix. 162 */ 163 static noinstr unsigned long __rdgsbase_inactive(void) 164 { 165 unsigned long gsbase; 166 167 lockdep_assert_irqs_disabled(); 168 169 if (!cpu_feature_enabled(X86_FEATURE_XENPV)) { 170 native_swapgs(); 171 gsbase = rdgsbase(); 172 native_swapgs(); 173 } else { 174 instrumentation_begin(); 175 rdmsrl(MSR_KERNEL_GS_BASE, gsbase); 176 instrumentation_end(); 177 } 178 179 return gsbase; 180 } 181 182 /* 183 * Out of line to be protected from kprobes and tracing. If this would be 184 * traced or probed than any access to a per CPU variable happens with 185 * the wrong GS. 186 * 187 * It is not used on Xen paravirt. When paravirt support is needed, it 188 * needs to be renamed with native_ prefix. 189 */ 190 static noinstr void __wrgsbase_inactive(unsigned long gsbase) 191 { 192 lockdep_assert_irqs_disabled(); 193 194 if (!cpu_feature_enabled(X86_FEATURE_XENPV)) { 195 native_swapgs(); 196 wrgsbase(gsbase); 197 native_swapgs(); 198 } else { 199 instrumentation_begin(); 200 wrmsrl(MSR_KERNEL_GS_BASE, gsbase); 201 instrumentation_end(); 202 } 203 } 204 205 /* 206 * Saves the FS or GS base for an outgoing thread if FSGSBASE extensions are 207 * not available. The goal is to be reasonably fast on non-FSGSBASE systems. 208 * It's forcibly inlined because it'll generate better code and this function 209 * is hot. 210 */ 211 static __always_inline void save_base_legacy(struct task_struct *prev_p, 212 unsigned short selector, 213 enum which_selector which) 214 { 215 if (likely(selector == 0)) { 216 /* 217 * On Intel (without X86_BUG_NULL_SEG), the segment base could 218 * be the pre-existing saved base or it could be zero. On AMD 219 * (with X86_BUG_NULL_SEG), the segment base could be almost 220 * anything. 221 * 222 * This branch is very hot (it's hit twice on almost every 223 * context switch between 64-bit programs), and avoiding 224 * the RDMSR helps a lot, so we just assume that whatever 225 * value is already saved is correct. This matches historical 226 * Linux behavior, so it won't break existing applications. 227 * 228 * To avoid leaking state, on non-X86_BUG_NULL_SEG CPUs, if we 229 * report that the base is zero, it needs to actually be zero: 230 * see the corresponding logic in load_seg_legacy. 231 */ 232 } else { 233 /* 234 * If the selector is 1, 2, or 3, then the base is zero on 235 * !X86_BUG_NULL_SEG CPUs and could be anything on 236 * X86_BUG_NULL_SEG CPUs. In the latter case, Linux 237 * has never attempted to preserve the base across context 238 * switches. 239 * 240 * If selector > 3, then it refers to a real segment, and 241 * saving the base isn't necessary. 242 */ 243 if (which == FS) 244 prev_p->thread.fsbase = 0; 245 else 246 prev_p->thread.gsbase = 0; 247 } 248 } 249 250 static __always_inline void save_fsgs(struct task_struct *task) 251 { 252 savesegment(fs, task->thread.fsindex); 253 savesegment(gs, task->thread.gsindex); 254 if (static_cpu_has(X86_FEATURE_FSGSBASE)) { 255 /* 256 * If FSGSBASE is enabled, we can't make any useful guesses 257 * about the base, and user code expects us to save the current 258 * value. Fortunately, reading the base directly is efficient. 259 */ 260 task->thread.fsbase = rdfsbase(); 261 task->thread.gsbase = __rdgsbase_inactive(); 262 } else { 263 save_base_legacy(task, task->thread.fsindex, FS); 264 save_base_legacy(task, task->thread.gsindex, GS); 265 } 266 } 267 268 /* 269 * While a process is running,current->thread.fsbase and current->thread.gsbase 270 * may not match the corresponding CPU registers (see save_base_legacy()). 271 */ 272 void current_save_fsgs(void) 273 { 274 unsigned long flags; 275 276 /* Interrupts need to be off for FSGSBASE */ 277 local_irq_save(flags); 278 save_fsgs(current); 279 local_irq_restore(flags); 280 } 281 #if IS_ENABLED(CONFIG_KVM) 282 EXPORT_SYMBOL_GPL(current_save_fsgs); 283 #endif 284 285 static __always_inline void loadseg(enum which_selector which, 286 unsigned short sel) 287 { 288 if (which == FS) 289 loadsegment(fs, sel); 290 else 291 load_gs_index(sel); 292 } 293 294 static __always_inline void load_seg_legacy(unsigned short prev_index, 295 unsigned long prev_base, 296 unsigned short next_index, 297 unsigned long next_base, 298 enum which_selector which) 299 { 300 if (likely(next_index <= 3)) { 301 /* 302 * The next task is using 64-bit TLS, is not using this 303 * segment at all, or is having fun with arcane CPU features. 304 */ 305 if (next_base == 0) { 306 /* 307 * Nasty case: on AMD CPUs, we need to forcibly zero 308 * the base. 309 */ 310 if (static_cpu_has_bug(X86_BUG_NULL_SEG)) { 311 loadseg(which, __USER_DS); 312 loadseg(which, next_index); 313 } else { 314 /* 315 * We could try to exhaustively detect cases 316 * under which we can skip the segment load, 317 * but there's really only one case that matters 318 * for performance: if both the previous and 319 * next states are fully zeroed, we can skip 320 * the load. 321 * 322 * (This assumes that prev_base == 0 has no 323 * false positives. This is the case on 324 * Intel-style CPUs.) 325 */ 326 if (likely(prev_index | next_index | prev_base)) 327 loadseg(which, next_index); 328 } 329 } else { 330 if (prev_index != next_index) 331 loadseg(which, next_index); 332 wrmsrl(which == FS ? MSR_FS_BASE : MSR_KERNEL_GS_BASE, 333 next_base); 334 } 335 } else { 336 /* 337 * The next task is using a real segment. Loading the selector 338 * is sufficient. 339 */ 340 loadseg(which, next_index); 341 } 342 } 343 344 /* 345 * Store prev's PKRU value and load next's PKRU value if they differ. PKRU 346 * is not XSTATE managed on context switch because that would require a 347 * lookup in the task's FPU xsave buffer and require to keep that updated 348 * in various places. 349 */ 350 static __always_inline void x86_pkru_load(struct thread_struct *prev, 351 struct thread_struct *next) 352 { 353 if (!cpu_feature_enabled(X86_FEATURE_OSPKE)) 354 return; 355 356 /* Stash the prev task's value: */ 357 prev->pkru = rdpkru(); 358 359 /* 360 * PKRU writes are slightly expensive. Avoid them when not 361 * strictly necessary: 362 */ 363 if (prev->pkru != next->pkru) 364 wrpkru(next->pkru); 365 } 366 367 static __always_inline void x86_fsgsbase_load(struct thread_struct *prev, 368 struct thread_struct *next) 369 { 370 if (static_cpu_has(X86_FEATURE_FSGSBASE)) { 371 /* Update the FS and GS selectors if they could have changed. */ 372 if (unlikely(prev->fsindex || next->fsindex)) 373 loadseg(FS, next->fsindex); 374 if (unlikely(prev->gsindex || next->gsindex)) 375 loadseg(GS, next->gsindex); 376 377 /* Update the bases. */ 378 wrfsbase(next->fsbase); 379 __wrgsbase_inactive(next->gsbase); 380 } else { 381 load_seg_legacy(prev->fsindex, prev->fsbase, 382 next->fsindex, next->fsbase, FS); 383 load_seg_legacy(prev->gsindex, prev->gsbase, 384 next->gsindex, next->gsbase, GS); 385 } 386 } 387 388 unsigned long x86_fsgsbase_read_task(struct task_struct *task, 389 unsigned short selector) 390 { 391 unsigned short idx = selector >> 3; 392 unsigned long base; 393 394 if (likely((selector & SEGMENT_TI_MASK) == 0)) { 395 if (unlikely(idx >= GDT_ENTRIES)) 396 return 0; 397 398 /* 399 * There are no user segments in the GDT with nonzero bases 400 * other than the TLS segments. 401 */ 402 if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) 403 return 0; 404 405 idx -= GDT_ENTRY_TLS_MIN; 406 base = get_desc_base(&task->thread.tls_array[idx]); 407 } else { 408 #ifdef CONFIG_MODIFY_LDT_SYSCALL 409 struct ldt_struct *ldt; 410 411 /* 412 * If performance here mattered, we could protect the LDT 413 * with RCU. This is a slow path, though, so we can just 414 * take the mutex. 415 */ 416 mutex_lock(&task->mm->context.lock); 417 ldt = task->mm->context.ldt; 418 if (unlikely(!ldt || idx >= ldt->nr_entries)) 419 base = 0; 420 else 421 base = get_desc_base(ldt->entries + idx); 422 mutex_unlock(&task->mm->context.lock); 423 #else 424 base = 0; 425 #endif 426 } 427 428 return base; 429 } 430 431 unsigned long x86_gsbase_read_cpu_inactive(void) 432 { 433 unsigned long gsbase; 434 435 if (boot_cpu_has(X86_FEATURE_FSGSBASE)) { 436 unsigned long flags; 437 438 local_irq_save(flags); 439 gsbase = __rdgsbase_inactive(); 440 local_irq_restore(flags); 441 } else { 442 rdmsrl(MSR_KERNEL_GS_BASE, gsbase); 443 } 444 445 return gsbase; 446 } 447 448 void x86_gsbase_write_cpu_inactive(unsigned long gsbase) 449 { 450 if (boot_cpu_has(X86_FEATURE_FSGSBASE)) { 451 unsigned long flags; 452 453 local_irq_save(flags); 454 __wrgsbase_inactive(gsbase); 455 local_irq_restore(flags); 456 } else { 457 wrmsrl(MSR_KERNEL_GS_BASE, gsbase); 458 } 459 } 460 461 unsigned long x86_fsbase_read_task(struct task_struct *task) 462 { 463 unsigned long fsbase; 464 465 if (task == current) 466 fsbase = x86_fsbase_read_cpu(); 467 else if (boot_cpu_has(X86_FEATURE_FSGSBASE) || 468 (task->thread.fsindex == 0)) 469 fsbase = task->thread.fsbase; 470 else 471 fsbase = x86_fsgsbase_read_task(task, task->thread.fsindex); 472 473 return fsbase; 474 } 475 476 unsigned long x86_gsbase_read_task(struct task_struct *task) 477 { 478 unsigned long gsbase; 479 480 if (task == current) 481 gsbase = x86_gsbase_read_cpu_inactive(); 482 else if (boot_cpu_has(X86_FEATURE_FSGSBASE) || 483 (task->thread.gsindex == 0)) 484 gsbase = task->thread.gsbase; 485 else 486 gsbase = x86_fsgsbase_read_task(task, task->thread.gsindex); 487 488 return gsbase; 489 } 490 491 void x86_fsbase_write_task(struct task_struct *task, unsigned long fsbase) 492 { 493 WARN_ON_ONCE(task == current); 494 495 task->thread.fsbase = fsbase; 496 } 497 498 void x86_gsbase_write_task(struct task_struct *task, unsigned long gsbase) 499 { 500 WARN_ON_ONCE(task == current); 501 502 task->thread.gsbase = gsbase; 503 } 504 505 static void 506 start_thread_common(struct pt_regs *regs, unsigned long new_ip, 507 unsigned long new_sp, 508 unsigned int _cs, unsigned int _ss, unsigned int _ds) 509 { 510 WARN_ON_ONCE(regs != current_pt_regs()); 511 512 if (static_cpu_has(X86_BUG_NULL_SEG)) { 513 /* Loading zero below won't clear the base. */ 514 loadsegment(fs, __USER_DS); 515 load_gs_index(__USER_DS); 516 } 517 518 loadsegment(fs, 0); 519 loadsegment(es, _ds); 520 loadsegment(ds, _ds); 521 load_gs_index(0); 522 523 regs->ip = new_ip; 524 regs->sp = new_sp; 525 regs->cs = _cs; 526 regs->ss = _ss; 527 regs->flags = X86_EFLAGS_IF; 528 } 529 530 void 531 start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) 532 { 533 start_thread_common(regs, new_ip, new_sp, 534 __USER_CS, __USER_DS, 0); 535 } 536 EXPORT_SYMBOL_GPL(start_thread); 537 538 #ifdef CONFIG_COMPAT 539 void compat_start_thread(struct pt_regs *regs, u32 new_ip, u32 new_sp, bool x32) 540 { 541 start_thread_common(regs, new_ip, new_sp, 542 x32 ? __USER_CS : __USER32_CS, 543 __USER_DS, __USER_DS); 544 } 545 #endif 546 547 /* 548 * switch_to(x,y) should switch tasks from x to y. 549 * 550 * This could still be optimized: 551 * - fold all the options into a flag word and test it with a single test. 552 * - could test fs/gs bitsliced 553 * 554 * Kprobes not supported here. Set the probe on schedule instead. 555 * Function graph tracer not supported too. 556 */ 557 __no_kmsan_checks 558 __visible __notrace_funcgraph struct task_struct * 559 __switch_to(struct task_struct *prev_p, struct task_struct *next_p) 560 { 561 struct thread_struct *prev = &prev_p->thread; 562 struct thread_struct *next = &next_p->thread; 563 struct fpu *prev_fpu = &prev->fpu; 564 int cpu = smp_processor_id(); 565 566 WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) && 567 this_cpu_read(pcpu_hot.hardirq_stack_inuse)); 568 569 if (!test_thread_flag(TIF_NEED_FPU_LOAD)) 570 switch_fpu_prepare(prev_fpu, cpu); 571 572 /* We must save %fs and %gs before load_TLS() because 573 * %fs and %gs may be cleared by load_TLS(). 574 * 575 * (e.g. xen_load_tls()) 576 */ 577 save_fsgs(prev_p); 578 579 /* 580 * Load TLS before restoring any segments so that segment loads 581 * reference the correct GDT entries. 582 */ 583 load_TLS(next, cpu); 584 585 /* 586 * Leave lazy mode, flushing any hypercalls made here. This 587 * must be done after loading TLS entries in the GDT but before 588 * loading segments that might reference them. 589 */ 590 arch_end_context_switch(next_p); 591 592 /* Switch DS and ES. 593 * 594 * Reading them only returns the selectors, but writing them (if 595 * nonzero) loads the full descriptor from the GDT or LDT. The 596 * LDT for next is loaded in switch_mm, and the GDT is loaded 597 * above. 598 * 599 * We therefore need to write new values to the segment 600 * registers on every context switch unless both the new and old 601 * values are zero. 602 * 603 * Note that we don't need to do anything for CS and SS, as 604 * those are saved and restored as part of pt_regs. 605 */ 606 savesegment(es, prev->es); 607 if (unlikely(next->es | prev->es)) 608 loadsegment(es, next->es); 609 610 savesegment(ds, prev->ds); 611 if (unlikely(next->ds | prev->ds)) 612 loadsegment(ds, next->ds); 613 614 x86_fsgsbase_load(prev, next); 615 616 x86_pkru_load(prev, next); 617 618 /* 619 * Switch the PDA and FPU contexts. 620 */ 621 raw_cpu_write(pcpu_hot.current_task, next_p); 622 raw_cpu_write(pcpu_hot.top_of_stack, task_top_of_stack(next_p)); 623 624 switch_fpu_finish(); 625 626 /* Reload sp0. */ 627 update_task_stack(next_p); 628 629 switch_to_extra(prev_p, next_p); 630 631 if (static_cpu_has_bug(X86_BUG_SYSRET_SS_ATTRS)) { 632 /* 633 * AMD CPUs have a misfeature: SYSRET sets the SS selector but 634 * does not update the cached descriptor. As a result, if we 635 * do SYSRET while SS is NULL, we'll end up in user mode with 636 * SS apparently equal to __USER_DS but actually unusable. 637 * 638 * The straightforward workaround would be to fix it up just 639 * before SYSRET, but that would slow down the system call 640 * fast paths. Instead, we ensure that SS is never NULL in 641 * system call context. We do this by replacing NULL SS 642 * selectors at every context switch. SYSCALL sets up a valid 643 * SS, so the only way to get NULL is to re-enter the kernel 644 * from CPL 3 through an interrupt. Since that can't happen 645 * in the same task as a running syscall, we are guaranteed to 646 * context switch between every interrupt vector entry and a 647 * subsequent SYSRET. 648 * 649 * We read SS first because SS reads are much faster than 650 * writes. Out of caution, we force SS to __KERNEL_DS even if 651 * it previously had a different non-NULL value. 652 */ 653 unsigned short ss_sel; 654 savesegment(ss, ss_sel); 655 if (ss_sel != __KERNEL_DS) 656 loadsegment(ss, __KERNEL_DS); 657 } 658 659 /* Load the Intel cache allocation PQR MSR. */ 660 resctrl_sched_in(next_p); 661 662 return prev_p; 663 } 664 665 void set_personality_64bit(void) 666 { 667 /* inherit personality from parent */ 668 669 /* Make sure to be in 64bit mode */ 670 clear_thread_flag(TIF_ADDR32); 671 /* Pretend that this comes from a 64bit execve */ 672 task_pt_regs(current)->orig_ax = __NR_execve; 673 current_thread_info()->status &= ~TS_COMPAT; 674 if (current->mm) 675 __set_bit(MM_CONTEXT_HAS_VSYSCALL, ¤t->mm->context.flags); 676 677 /* TBD: overwrites user setup. Should have two bits. 678 But 64bit processes have always behaved this way, 679 so it's not too bad. The main problem is just that 680 32bit children are affected again. */ 681 current->personality &= ~READ_IMPLIES_EXEC; 682 } 683 684 static void __set_personality_x32(void) 685 { 686 #ifdef CONFIG_X86_X32_ABI 687 if (current->mm) 688 current->mm->context.flags = 0; 689 690 current->personality &= ~READ_IMPLIES_EXEC; 691 /* 692 * in_32bit_syscall() uses the presence of the x32 syscall bit 693 * flag to determine compat status. The x86 mmap() code relies on 694 * the syscall bitness so set x32 syscall bit right here to make 695 * in_32bit_syscall() work during exec(). 696 * 697 * Pretend to come from a x32 execve. 698 */ 699 task_pt_regs(current)->orig_ax = __NR_x32_execve | __X32_SYSCALL_BIT; 700 current_thread_info()->status &= ~TS_COMPAT; 701 #endif 702 } 703 704 static void __set_personality_ia32(void) 705 { 706 #ifdef CONFIG_IA32_EMULATION 707 if (current->mm) { 708 /* 709 * uprobes applied to this MM need to know this and 710 * cannot use user_64bit_mode() at that time. 711 */ 712 __set_bit(MM_CONTEXT_UPROBE_IA32, ¤t->mm->context.flags); 713 } 714 715 current->personality |= force_personality32; 716 /* Prepare the first "return" to user space */ 717 task_pt_regs(current)->orig_ax = __NR_ia32_execve; 718 current_thread_info()->status |= TS_COMPAT; 719 #endif 720 } 721 722 void set_personality_ia32(bool x32) 723 { 724 /* Make sure to be in 32bit mode */ 725 set_thread_flag(TIF_ADDR32); 726 727 if (x32) 728 __set_personality_x32(); 729 else 730 __set_personality_ia32(); 731 } 732 EXPORT_SYMBOL_GPL(set_personality_ia32); 733 734 #ifdef CONFIG_CHECKPOINT_RESTORE 735 static long prctl_map_vdso(const struct vdso_image *image, unsigned long addr) 736 { 737 int ret; 738 739 ret = map_vdso_once(image, addr); 740 if (ret) 741 return ret; 742 743 return (long)image->size; 744 } 745 #endif 746 747 #ifdef CONFIG_ADDRESS_MASKING 748 749 #define LAM_U57_BITS 6 750 751 static int prctl_enable_tagged_addr(struct mm_struct *mm, unsigned long nr_bits) 752 { 753 if (!cpu_feature_enabled(X86_FEATURE_LAM)) 754 return -ENODEV; 755 756 /* PTRACE_ARCH_PRCTL */ 757 if (current->mm != mm) 758 return -EINVAL; 759 760 if (mm_valid_pasid(mm) && 761 !test_bit(MM_CONTEXT_FORCE_TAGGED_SVA, &mm->context.flags)) 762 return -EINVAL; 763 764 if (mmap_write_lock_killable(mm)) 765 return -EINTR; 766 767 if (test_bit(MM_CONTEXT_LOCK_LAM, &mm->context.flags)) { 768 mmap_write_unlock(mm); 769 return -EBUSY; 770 } 771 772 if (!nr_bits) { 773 mmap_write_unlock(mm); 774 return -EINVAL; 775 } else if (nr_bits <= LAM_U57_BITS) { 776 mm->context.lam_cr3_mask = X86_CR3_LAM_U57; 777 mm->context.untag_mask = ~GENMASK(62, 57); 778 } else { 779 mmap_write_unlock(mm); 780 return -EINVAL; 781 } 782 783 write_cr3(__read_cr3() | mm->context.lam_cr3_mask); 784 set_tlbstate_lam_mode(mm); 785 set_bit(MM_CONTEXT_LOCK_LAM, &mm->context.flags); 786 787 mmap_write_unlock(mm); 788 789 return 0; 790 } 791 #endif 792 793 long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2) 794 { 795 int ret = 0; 796 797 switch (option) { 798 case ARCH_SET_GS: { 799 if (unlikely(arg2 >= TASK_SIZE_MAX)) 800 return -EPERM; 801 802 preempt_disable(); 803 /* 804 * ARCH_SET_GS has always overwritten the index 805 * and the base. Zero is the most sensible value 806 * to put in the index, and is the only value that 807 * makes any sense if FSGSBASE is unavailable. 808 */ 809 if (task == current) { 810 loadseg(GS, 0); 811 x86_gsbase_write_cpu_inactive(arg2); 812 813 /* 814 * On non-FSGSBASE systems, save_base_legacy() expects 815 * that we also fill in thread.gsbase. 816 */ 817 task->thread.gsbase = arg2; 818 819 } else { 820 task->thread.gsindex = 0; 821 x86_gsbase_write_task(task, arg2); 822 } 823 preempt_enable(); 824 break; 825 } 826 case ARCH_SET_FS: { 827 /* 828 * Not strictly needed for %fs, but do it for symmetry 829 * with %gs 830 */ 831 if (unlikely(arg2 >= TASK_SIZE_MAX)) 832 return -EPERM; 833 834 preempt_disable(); 835 /* 836 * Set the selector to 0 for the same reason 837 * as %gs above. 838 */ 839 if (task == current) { 840 loadseg(FS, 0); 841 x86_fsbase_write_cpu(arg2); 842 843 /* 844 * On non-FSGSBASE systems, save_base_legacy() expects 845 * that we also fill in thread.fsbase. 846 */ 847 task->thread.fsbase = arg2; 848 } else { 849 task->thread.fsindex = 0; 850 x86_fsbase_write_task(task, arg2); 851 } 852 preempt_enable(); 853 break; 854 } 855 case ARCH_GET_FS: { 856 unsigned long base = x86_fsbase_read_task(task); 857 858 ret = put_user(base, (unsigned long __user *)arg2); 859 break; 860 } 861 case ARCH_GET_GS: { 862 unsigned long base = x86_gsbase_read_task(task); 863 864 ret = put_user(base, (unsigned long __user *)arg2); 865 break; 866 } 867 868 #ifdef CONFIG_CHECKPOINT_RESTORE 869 # ifdef CONFIG_X86_X32_ABI 870 case ARCH_MAP_VDSO_X32: 871 return prctl_map_vdso(&vdso_image_x32, arg2); 872 # endif 873 # if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION 874 case ARCH_MAP_VDSO_32: 875 return prctl_map_vdso(&vdso_image_32, arg2); 876 # endif 877 case ARCH_MAP_VDSO_64: 878 return prctl_map_vdso(&vdso_image_64, arg2); 879 #endif 880 #ifdef CONFIG_ADDRESS_MASKING 881 case ARCH_GET_UNTAG_MASK: 882 return put_user(task->mm->context.untag_mask, 883 (unsigned long __user *)arg2); 884 case ARCH_ENABLE_TAGGED_ADDR: 885 return prctl_enable_tagged_addr(task->mm, arg2); 886 case ARCH_FORCE_TAGGED_SVA: 887 if (current != task) 888 return -EINVAL; 889 set_bit(MM_CONTEXT_FORCE_TAGGED_SVA, &task->mm->context.flags); 890 return 0; 891 case ARCH_GET_MAX_TAG_BITS: 892 if (!cpu_feature_enabled(X86_FEATURE_LAM)) 893 return put_user(0, (unsigned long __user *)arg2); 894 else 895 return put_user(LAM_U57_BITS, (unsigned long __user *)arg2); 896 #endif 897 default: 898 ret = -EINVAL; 899 break; 900 } 901 902 return ret; 903 } 904 905 SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2) 906 { 907 long ret; 908 909 ret = do_arch_prctl_64(current, option, arg2); 910 if (ret == -EINVAL) 911 ret = do_arch_prctl_common(option, arg2); 912 913 return ret; 914 } 915 916 #ifdef CONFIG_IA32_EMULATION 917 COMPAT_SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2) 918 { 919 return do_arch_prctl_common(option, arg2); 920 } 921 #endif 922 923 unsigned long KSTK_ESP(struct task_struct *task) 924 { 925 return task_pt_regs(task)->sp; 926 } 927