// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 1995  Linus Torvalds
 *
 * Pentium III FXSR, SSE support
 *	Gareth Hughes <gareth@valinux.com>, May 2000
 *
 * X86-64 port
 *	Andi Kleen.
 *
 *	CPU hotplug support - ashok.raj@intel.com
 */

/*
 * This file handles the architecture-dependent parts of process handling.
 */

#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/sched/task.h>
#include <linux/sched/task_stack.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/interrupt.h>
#include <linux/delay.h>
#include <linux/export.h>
#include <linux/ptrace.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/prctl.h>
#include <linux/uaccess.h>
#include <linux/io.h>
#include <linux/ftrace.h>
#include <linux/syscalls.h>
#include <linux/iommu.h>

#include <asm/processor.h>
#include <asm/pkru.h>
#include <asm/fpu/sched.h>
#include <asm/mmu_context.h>
#include <asm/prctl.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/ia32.h>
#include <asm/debugreg.h>
#include <asm/switch_to.h>
#include <asm/xen/hypervisor.h>
#include <asm/vdso.h>
#include <asm/resctrl.h>
#include <asm/unistd.h>
#include <asm/fsgsbase.h>
#include <asm/fred.h>
#include <asm/msr.h>
#ifdef CONFIG_IA32_EMULATION
/* Not included via unistd.h */
#include <asm/unistd_32_ia32.h>
#endif

#include "process.h"

/* Also prints some state that isn't saved in the pt_regs. */
void __show_regs(struct pt_regs *regs, enum show_regs_mode mode,
		 const char *log_lvl)
{
	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
	unsigned long d0, d1, d2, d3, d6, d7;
	unsigned int fsindex, gsindex;
	unsigned int ds, es;

	show_iret_regs(regs, log_lvl);

	if (regs->orig_ax != -1)
		pr_cont(" ORIG_RAX: %016lx\n", regs->orig_ax);
	else
		pr_cont("\n");

	printk("%sRAX: %016lx RBX: %016lx RCX: %016lx\n",
	       log_lvl, regs->ax, regs->bx, regs->cx);
	printk("%sRDX: %016lx RSI: %016lx RDI: %016lx\n",
	       log_lvl, regs->dx, regs->si, regs->di);
	printk("%sRBP: %016lx R08: %016lx R09: %016lx\n",
	       log_lvl, regs->bp, regs->r8, regs->r9);
	printk("%sR10: %016lx R11: %016lx R12: %016lx\n",
	       log_lvl, regs->r10, regs->r11, regs->r12);
	printk("%sR13: %016lx R14: %016lx R15: %016lx\n",
	       log_lvl, regs->r13, regs->r14, regs->r15);

	if (mode == SHOW_REGS_SHORT)
		return;

	if (mode == SHOW_REGS_USER) {
		rdmsrq(MSR_FS_BASE, fs);
		rdmsrq(MSR_KERNEL_GS_BASE, shadowgs);
		printk("%sFS: %016lx GS: %016lx\n",
		       log_lvl, fs, shadowgs);
		return;
	}

	asm("movl %%ds,%0" : "=r" (ds));
	asm("movl %%es,%0" : "=r" (es));
	asm("movl %%fs,%0" : "=r" (fsindex));
	asm("movl %%gs,%0" : "=r" (gsindex));

	rdmsrq(MSR_FS_BASE, fs);
	rdmsrq(MSR_GS_BASE, gs);
	rdmsrq(MSR_KERNEL_GS_BASE, shadowgs);

	cr0 = read_cr0();
	cr2 = read_cr2();
	cr3 = __read_cr3();
	cr4 = __read_cr4();

	printk("%sFS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
	       log_lvl, fs, fsindex, gs, gsindex, shadowgs);
	printk("%sCS: %04x DS: %04x ES: %04x CR0: %016lx\n",
	       log_lvl, regs->cs, ds, es, cr0);
	printk("%sCR2: %016lx CR3: %016lx CR4: %016lx\n",
	       log_lvl, cr2, cr3, cr4);

	get_debugreg(d0, 0);
	get_debugreg(d1, 1);
	get_debugreg(d2, 2);
	get_debugreg(d3, 3);
	get_debugreg(d6, 6);
	get_debugreg(d7, 7);

	/* Only print out debug registers if they are in their non-default state. */
	if (!((d0 == 0) && (d1 == 0) && (d2 == 0) && (d3 == 0) &&
	      (d6 == DR6_RESERVED) && (d7 == DR7_FIXED_1))) {
		printk("%sDR0: %016lx DR1: %016lx DR2: %016lx\n",
		       log_lvl, d0, d1, d2);
		printk("%sDR3: %016lx DR6: %016lx DR7: %016lx\n",
		       log_lvl, d3, d6, d7);
	}

	if (cr4 & X86_CR4_PKE)
		printk("%sPKRU: %08x\n", log_lvl, read_pkru());
}

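/*
 * Nothing architecture-specific is left to free for a dead task on x86-64;
 * by this point the task must already have dropped its mm (hence the WARN).
 */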
void release_thread(struct task_struct *dead_task)
{
	WARN_ON(dead_task->mm);
}

enum which_selector {
	FS,
	GS
};

/*
 * Out of line to be protected from kprobes and tracing. If this would be
 * traced or probed then any access to a per CPU variable happens with
 * the wrong GS.
 *
 * It is not used on Xen paravirt. When paravirt support is needed, it
 * needs to be renamed with native_ prefix.
 */
static noinstr unsigned long __rdgsbase_inactive(void)
{
	unsigned long gsbase;

	lockdep_assert_irqs_disabled();

	/*
	 * SWAPGS is no longer needed thus NOT allowed with FRED because
	 * FRED transitions ensure that an operating system can _always_
	 * operate with its own GS base address:
	 * - For events that occur in ring 3, FRED event delivery swaps
	 *   the GS base address with the IA32_KERNEL_GS_BASE MSR.
	 * - ERETU (the FRED transition that returns to ring 3) also swaps
	 *   the GS base address with the IA32_KERNEL_GS_BASE MSR.
	 *
	 * And the operating system can still set up the GS segment for a
	 * user thread without needing to load a user thread GS with:
	 * - Using LKGS, available with FRED, to modify other attributes
	 *   of the GS segment without compromising its ability always to
	 *   operate with its own GS base address.
	 * - Accessing the GS segment base address for a user thread as
	 *   before using RDMSR or WRMSR on the IA32_KERNEL_GS_BASE MSR.
	 *
	 * Note, LKGS loads the GS base address into the IA32_KERNEL_GS_BASE
	 * MSR instead of the GS segment's descriptor cache. As such, the
	 * operating system never changes its runtime GS base address.
	 */
	if (!cpu_feature_enabled(X86_FEATURE_FRED) &&
	    !cpu_feature_enabled(X86_FEATURE_XENPV)) {
		native_swapgs();
		gsbase = rdgsbase();
		native_swapgs();
	} else {
		instrumentation_begin();
		rdmsrq(MSR_KERNEL_GS_BASE, gsbase);
		instrumentation_end();
	}

	return gsbase;
}

/*
 * Out of line to be protected from kprobes and tracing. If this would be
 * traced or probed then any access to a per CPU variable happens with
 * the wrong GS.
 *
 * It is not used on Xen paravirt. When paravirt support is needed, it
 * needs to be renamed with native_ prefix.
 */
static noinstr void __wrgsbase_inactive(unsigned long gsbase)
{
	lockdep_assert_irqs_disabled();

	if (!cpu_feature_enabled(X86_FEATURE_FRED) &&
	    !cpu_feature_enabled(X86_FEATURE_XENPV)) {
		native_swapgs();
		wrgsbase(gsbase);
		native_swapgs();
	} else {
		instrumentation_begin();
		wrmsrq(MSR_KERNEL_GS_BASE, gsbase);
		instrumentation_end();
	}
}

/*
 * Saves the FS or GS base for an outgoing thread if FSGSBASE extensions are
 * not available. The goal is to be reasonably fast on non-FSGSBASE systems.
 * It's forcibly inlined because it'll generate better code and this function
 * is hot.
 */
static __always_inline void save_base_legacy(struct task_struct *prev_p,
					     unsigned short selector,
					     enum which_selector which)
{
	if (likely(selector == 0)) {
		/*
		 * On Intel (without X86_BUG_NULL_SEG), the segment base could
		 * be the pre-existing saved base or it could be zero. On AMD
		 * (with X86_BUG_NULL_SEG), the segment base could be almost
		 * anything.
		 *
		 * This branch is very hot (it's hit twice on almost every
		 * context switch between 64-bit programs), and avoiding
		 * the RDMSR helps a lot, so we just assume that whatever
		 * value is already saved is correct. This matches historical
		 * Linux behavior, so it won't break existing applications.
		 *
		 * To avoid leaking state, on non-X86_BUG_NULL_SEG CPUs, if we
		 * report that the base is zero, it needs to actually be zero:
		 * see the corresponding logic in load_seg_legacy.
		 */
	} else {
		/*
		 * If the selector is 1, 2, or 3, then the base is zero on
		 * !X86_BUG_NULL_SEG CPUs and could be anything on
		 * X86_BUG_NULL_SEG CPUs. In the latter case, Linux
		 * has never attempted to preserve the base across context
		 * switches.
		 *
		 * If selector > 3, then it refers to a real segment, and
		 * saving the base isn't necessary.
		 */
		if (which == FS)
			prev_p->thread.fsbase = 0;
		else
			prev_p->thread.gsbase = 0;
	}
}

static __always_inline void save_fsgs(struct task_struct *task)
{
	savesegment(fs, task->thread.fsindex);
	savesegment(gs, task->thread.gsindex);
	if (static_cpu_has(X86_FEATURE_FSGSBASE)) {
		/*
		 * If FSGSBASE is enabled, we can't make any useful guesses
		 * about the base, and user code expects us to save the current
		 * value. Fortunately, reading the base directly is efficient.
		 */
		task->thread.fsbase = rdfsbase();
		task->thread.gsbase = __rdgsbase_inactive();
	} else {
		save_base_legacy(task, task->thread.fsindex, FS);
		save_base_legacy(task, task->thread.gsindex, GS);
	}
}

/*
 * While a process is running, current->thread.fsbase and current->thread.gsbase
 * may not match the corresponding CPU registers (see save_base_legacy()).
 */
void current_save_fsgs(void)
{
	unsigned long flags;

	/* Interrupts need to be off for FSGSBASE */
	local_irq_save(flags);
	save_fsgs(current);
	local_irq_restore(flags);
}
#if IS_ENABLED(CONFIG_KVM)
EXPORT_SYMBOL_GPL(current_save_fsgs);
#endif

static __always_inline void loadseg(enum which_selector which,
				    unsigned short sel)
{
	if (which == FS)
		loadsegment(fs, sel);
	else
		load_gs_index(sel);
}

static __always_inline void load_seg_legacy(unsigned short prev_index,
					    unsigned long prev_base,
					    unsigned short next_index,
					    unsigned long next_base,
					    enum which_selector which)
{
	if (likely(next_index <= 3)) {
		/*
		 * The next task is using 64-bit TLS, is not using this
		 * segment at all, or is having fun with arcane CPU features.
		 */
		if (next_base == 0) {
			/*
			 * Nasty case: on AMD CPUs, we need to forcibly zero
			 * the base.
			 */
			if (static_cpu_has_bug(X86_BUG_NULL_SEG)) {
				loadseg(which, __USER_DS);
				loadseg(which, next_index);
			} else {
				/*
				 * We could try to exhaustively detect cases
				 * under which we can skip the segment load,
				 * but there's really only one case that matters
				 * for performance: if both the previous and
				 * next states are fully zeroed, we can skip
				 * the load.
				 *
				 * (This assumes that prev_base == 0 has no
				 * false positives. This is the case on
				 * Intel-style CPUs.)
				 */
				if (likely(prev_index | next_index | prev_base))
					loadseg(which, next_index);
			}
		} else {
			if (prev_index != next_index)
				loadseg(which, next_index);
			wrmsrq(which == FS ? MSR_FS_BASE : MSR_KERNEL_GS_BASE,
			       next_base);
		}
	} else {
		/*
		 * The next task is using a real segment. Loading the selector
		 * is sufficient.
		 */
		loadseg(which, next_index);
	}
}

/*
 * Store prev's PKRU value and load next's PKRU value if they differ. PKRU
 * is not XSTATE managed on context switch because that would require a
 * lookup in the task's FPU xsave buffer and would require keeping that
 * updated in various places.
 */
static __always_inline void x86_pkru_load(struct thread_struct *prev,
					  struct thread_struct *next)
{
	if (!cpu_feature_enabled(X86_FEATURE_OSPKE))
		return;

	/* Stash the prev task's value: */
	prev->pkru = rdpkru();

	/*
	 * PKRU writes are slightly expensive. Avoid them when not
	 * strictly necessary:
	 */
	if (prev->pkru != next->pkru)
		wrpkru(next->pkru);
}

static __always_inline void x86_fsgsbase_load(struct thread_struct *prev,
					      struct thread_struct *next)
{
	if (static_cpu_has(X86_FEATURE_FSGSBASE)) {
		/* Update the FS and GS selectors if they could have changed. */
		if (unlikely(prev->fsindex || next->fsindex))
			loadseg(FS, next->fsindex);
		if (unlikely(prev->gsindex || next->gsindex))
			loadseg(GS, next->gsindex);

		/* Update the bases. */
		wrfsbase(next->fsbase);
		__wrgsbase_inactive(next->gsbase);
	} else {
		load_seg_legacy(prev->fsindex, prev->fsbase,
				next->fsindex, next->fsbase, FS);
		load_seg_legacy(prev->gsindex, prev->gsbase,
				next->gsindex, next->gsbase, GS);
	}
}

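/*
 * Resolve the base address that @selector would have for @task: TLS
 * selectors are read from the task's TLS array, LDT selectors from the
 * mm's LDT. Any other selector is known to have a zero base.
 */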
unsigned long x86_fsgsbase_read_task(struct task_struct *task,
				     unsigned short selector)
{
	unsigned short idx = selector >> 3;
	unsigned long base;

	if (likely((selector & SEGMENT_TI_MASK) == 0)) {
		if (unlikely(idx >= GDT_ENTRIES))
			return 0;

		/*
		 * There are no user segments in the GDT with nonzero bases
		 * other than the TLS segments.
		 */
		if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
			return 0;

		idx -= GDT_ENTRY_TLS_MIN;
		base = get_desc_base(&task->thread.tls_array[idx]);
	} else {
#ifdef CONFIG_MODIFY_LDT_SYSCALL
		struct ldt_struct *ldt;

		/*
		 * If performance here mattered, we could protect the LDT
		 * with RCU. This is a slow path, though, so we can just
		 * take the mutex.
		 */
		mutex_lock(&task->mm->context.lock);
		ldt = task->mm->context.ldt;
		if (unlikely(!ldt || idx >= ldt->nr_entries))
			base = 0;
		else
			base = get_desc_base(ldt->entries + idx);
		mutex_unlock(&task->mm->context.lock);
#else
		base = 0;
#endif
	}

	return base;
}

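/*
 * Read this CPU's inactive (shadow) GS base, i.e. the user GS base that is
 * parked in MSR_KERNEL_GS_BASE while the kernel is running.
 */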
unsigned long x86_gsbase_read_cpu_inactive(void)
{
	unsigned long gsbase;

	if (boot_cpu_has(X86_FEATURE_FSGSBASE)) {
		unsigned long flags;

		local_irq_save(flags);
		gsbase = __rdgsbase_inactive();
		local_irq_restore(flags);
	} else {
		rdmsrq(MSR_KERNEL_GS_BASE, gsbase);
	}

	return gsbase;
}

void x86_gsbase_write_cpu_inactive(unsigned long gsbase)
{
	if (boot_cpu_has(X86_FEATURE_FSGSBASE)) {
		unsigned long flags;

		local_irq_save(flags);
		__wrgsbase_inactive(gsbase);
		local_irq_restore(flags);
	} else {
		wrmsrq(MSR_KERNEL_GS_BASE, gsbase);
	}
}

unsigned long x86_fsbase_read_task(struct task_struct *task)
{
	unsigned long fsbase;

	if (task == current)
		fsbase = x86_fsbase_read_cpu();
	else if (boot_cpu_has(X86_FEATURE_FSGSBASE) ||
		 (task->thread.fsindex == 0))
		fsbase = task->thread.fsbase;
	else
		fsbase = x86_fsgsbase_read_task(task, task->thread.fsindex);

	return fsbase;
}

unsigned long x86_gsbase_read_task(struct task_struct *task)
{
	unsigned long gsbase;

	if (task == current)
		gsbase = x86_gsbase_read_cpu_inactive();
	else if (boot_cpu_has(X86_FEATURE_FSGSBASE) ||
		 (task->thread.gsindex == 0))
		gsbase = task->thread.gsbase;
	else
		gsbase = x86_fsgsbase_read_task(task, task->thread.gsindex);

	return gsbase;
}

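/*
 * The *_write_task() helpers only update the saved thread state; they never
 * touch live CPU registers, so they must not be used on the current task
 * (hence the WARN_ON_ONCE below).
 */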
void x86_fsbase_write_task(struct task_struct *task, unsigned long fsbase)
{
	WARN_ON_ONCE(task == current);

	task->thread.fsbase = fsbase;
}

void x86_gsbase_write_task(struct task_struct *task, unsigned long gsbase)
{
	WARN_ON_ONCE(task == current);

	task->thread.gsbase = gsbase;
}

static void
start_thread_common(struct pt_regs *regs, unsigned long new_ip,
		    unsigned long new_sp,
		    u16 _cs, u16 _ss, u16 _ds)
{
	WARN_ON_ONCE(regs != current_pt_regs());

	if (static_cpu_has(X86_BUG_NULL_SEG)) {
		/* Loading zero below won't clear the base. */
		loadsegment(fs, __USER_DS);
		load_gs_index(__USER_DS);
	}

	reset_thread_features();

	loadsegment(fs, 0);
	loadsegment(es, _ds);
	loadsegment(ds, _ds);
	load_gs_index(0);

	regs->ip = new_ip;
	regs->sp = new_sp;
	regs->csx = _cs;
	regs->ssx = _ss;
	/*
	 * Allow single-step trap and NMI when starting a new task, thus
	 * once the new task enters user space, single-step trap and NMI
	 * are both enabled immediately.
	 *
	 * Entering a new task is logically speaking a return from a
	 * system call (exec, fork, clone, etc.). As such, if ptrace
	 * enables single stepping, a single-step exception should be
	 * allowed to trigger immediately upon entering user space.
	 * This is not optional.
	 *
	 * NMI should *never* be disabled in user space. As such, this
	 * is an optional, opportunistic way to catch errors.
	 *
	 * Paranoia: High-order 48 bits above the lowest 16 bit SS are
	 * discarded by the legacy IRET instruction on all Intel, AMD,
	 * and Cyrix/Centaur/VIA CPUs, thus can be set unconditionally,
	 * even when FRED is not enabled. But we err on the safe side
	 * and use these bits only when FRED is enabled.
	 */
	if (cpu_feature_enabled(X86_FEATURE_FRED)) {
		regs->fred_ss.swevent = true;
		regs->fred_ss.nmi = true;
	}

	regs->flags = X86_EFLAGS_IF | X86_EFLAGS_FIXED;
}

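/*
 * Set up pt_regs for the first return to 64-bit user mode after exec():
 * user code/stack segments, the new instruction and stack pointers, and
 * sane initial flags.
 */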
void
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
	start_thread_common(regs, new_ip, new_sp,
			    __USER_CS, __USER_DS, 0);
}
EXPORT_SYMBOL_GPL(start_thread);

#ifdef CONFIG_COMPAT
void compat_start_thread(struct pt_regs *regs, u32 new_ip, u32 new_sp, bool x32)
{
	start_thread_common(regs, new_ip, new_sp,
			    x32 ? __USER_CS : __USER32_CS,
			    __USER_DS, __USER_DS);
}
#endif

/*
 * switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes not supported here. Set the probe on schedule instead.
 * Function graph tracer not supported either.
 */
__no_kmsan_checks
__visible __notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
	struct thread_struct *prev = &prev_p->thread;
	struct thread_struct *next = &next_p->thread;
	int cpu = smp_processor_id();

	WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) &&
		     this_cpu_read(hardirq_stack_inuse));

	switch_fpu(prev_p, cpu);

	/* We must save %fs and %gs before load_TLS() because
	 * %fs and %gs may be cleared by load_TLS().
	 *
	 * (e.g. xen_load_tls())
	 */
	save_fsgs(prev_p);

	/*
	 * Load TLS before restoring any segments so that segment loads
	 * reference the correct GDT entries.
	 */
	load_TLS(next, cpu);

	/*
	 * Leave lazy mode, flushing any hypercalls made here. This
	 * must be done after loading TLS entries in the GDT but before
	 * loading segments that might reference them.
	 */
	arch_end_context_switch(next_p);

	/* Switch DS and ES.
	 *
	 * Reading them only returns the selectors, but writing them (if
	 * nonzero) loads the full descriptor from the GDT or LDT. The
	 * LDT for next is loaded in switch_mm, and the GDT is loaded
	 * above.
	 *
	 * We therefore need to write new values to the segment
	 * registers on every context switch unless both the new and old
	 * values are zero.
	 *
	 * Note that we don't need to do anything for CS and SS, as
	 * those are saved and restored as part of pt_regs.
	 */
	savesegment(es, prev->es);
	if (unlikely(next->es | prev->es))
		loadsegment(es, next->es);

	savesegment(ds, prev->ds);
	if (unlikely(next->ds | prev->ds))
		loadsegment(ds, next->ds);

	x86_fsgsbase_load(prev, next);

	x86_pkru_load(prev, next);

	/*
	 * Switch the PDA and FPU contexts.
	 */
	raw_cpu_write(current_task, next_p);
	raw_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p));

	/* Reload sp0. */
	update_task_stack(next_p);

	switch_to_extra(prev_p, next_p);

	if (static_cpu_has_bug(X86_BUG_SYSRET_SS_ATTRS)) {
		/*
		 * AMD CPUs have a misfeature: SYSRET sets the SS selector but
		 * does not update the cached descriptor. As a result, if we
		 * do SYSRET while SS is NULL, we'll end up in user mode with
		 * SS apparently equal to __USER_DS but actually unusable.
		 *
		 * The straightforward workaround would be to fix it up just
		 * before SYSRET, but that would slow down the system call
		 * fast paths. Instead, we ensure that SS is never NULL in
		 * system call context. We do this by replacing NULL SS
		 * selectors at every context switch. SYSCALL sets up a valid
		 * SS, so the only way to get NULL is to re-enter the kernel
		 * from CPL 3 through an interrupt. Since that can't happen
		 * in the same task as a running syscall, we are guaranteed to
		 * context switch between every interrupt vector entry and a
		 * subsequent SYSRET.
		 *
		 * We read SS first because SS reads are much faster than
		 * writes. Out of caution, we force SS to __KERNEL_DS even if
		 * it previously had a different non-NULL value.
		 */
		unsigned short ss_sel;
		savesegment(ss, ss_sel);
		if (ss_sel != __KERNEL_DS)
			loadsegment(ss, __KERNEL_DS);
	}

	/* Load the Intel cache allocation PQR MSR. */
	resctrl_arch_sched_in(next_p);

	/* Reset hw history on AMD CPUs */
	if (cpu_feature_enabled(X86_FEATURE_AMD_WORKLOAD_CLASS))
		wrmsrl(MSR_AMD_WORKLOAD_HRST, 0x1);

	return prev_p;
}

void set_personality_64bit(void)
{
	/* inherit personality from parent */

	/* Make sure to be in 64bit mode */
	clear_thread_flag(TIF_ADDR32);
	/* Pretend that this comes from a 64bit execve */
	task_pt_regs(current)->orig_ax = __NR_execve;
	current_thread_info()->status &= ~TS_COMPAT;
	if (current->mm)
		__set_bit(MM_CONTEXT_HAS_VSYSCALL, &current->mm->context.flags);

	/* TBD: overwrites user setup. Should have two bits.
	   But 64bit processes have always behaved this way,
	   so it's not too bad. The main problem is just that
	   32bit children are affected again. */
	current->personality &= ~READ_IMPLIES_EXEC;
}

static void __set_personality_x32(void)
{
#ifdef CONFIG_X86_X32_ABI
	if (current->mm)
		current->mm->context.flags = 0;

	current->personality &= ~READ_IMPLIES_EXEC;
	/*
	 * in_32bit_syscall() uses the presence of the x32 syscall bit
	 * flag to determine compat status. The x86 mmap() code relies on
	 * the syscall bitness so set x32 syscall bit right here to make
	 * in_32bit_syscall() work during exec().
	 *
	 * Pretend to come from an x32 execve.
	 */
	task_pt_regs(current)->orig_ax = __NR_x32_execve | __X32_SYSCALL_BIT;
	current_thread_info()->status &= ~TS_COMPAT;
#endif
}

static void __set_personality_ia32(void)
{
#ifdef CONFIG_IA32_EMULATION
	if (current->mm) {
		/*
		 * uprobes applied to this MM need to know this and
		 * cannot use user_64bit_mode() at that time.
		 */
		__set_bit(MM_CONTEXT_UPROBE_IA32, &current->mm->context.flags);
	}

	current->personality |= force_personality32;
	/* Prepare the first "return" to user space */
	task_pt_regs(current)->orig_ax = __NR_ia32_execve;
	current_thread_info()->status |= TS_COMPAT;
#endif
}

void set_personality_ia32(bool x32)
{
	/* Make sure to be in 32bit mode */
	set_thread_flag(TIF_ADDR32);

	if (x32)
		__set_personality_x32();
	else
		__set_personality_ia32();
}
EXPORT_SYMBOL_GPL(set_personality_ia32);

#ifdef CONFIG_CHECKPOINT_RESTORE
static long prctl_map_vdso(const struct vdso_image *image, unsigned long addr)
{
	int ret;

	ret = map_vdso_once(image, addr);
	if (ret)
		return ret;

	return (long)image->size;
}
#endif

#ifdef CONFIG_ADDRESS_MASKING

#define LAM_U57_BITS 6

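/*
 * Runs on every CPU that might have the target mm loaded: if it does, fold
 * the LAM bits into CR3 and refresh the per-CPU untag mask.
 */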
static void enable_lam_func(void *__mm)
{
	struct mm_struct *mm = __mm;
	unsigned long lam;

	if (this_cpu_read(cpu_tlbstate.loaded_mm) == mm) {
		lam = mm_lam_cr3_mask(mm);
		write_cr3(__read_cr3() | lam);
		cpu_tlbstate_update_lam(lam, mm_untag_mask(mm));
	}
}

static void mm_enable_lam(struct mm_struct *mm)
{
	mm->context.lam_cr3_mask = X86_CR3_LAM_U57;
	mm->context.untag_mask = ~GENMASK(62, 57);

	/*
	 * Even though the process must still be single-threaded at this
	 * point, kernel threads may be using the mm. IPI those kernel
	 * threads if they exist.
	 */
	on_each_cpu_mask(mm_cpumask(mm), enable_lam_func, mm, true);
	set_bit(MM_CONTEXT_LOCK_LAM, &mm->context.flags);
}

static int prctl_enable_tagged_addr(struct mm_struct *mm, unsigned long nr_bits)
{
	if (!cpu_feature_enabled(X86_FEATURE_LAM))
		return -ENODEV;

	/* PTRACE_ARCH_PRCTL */
	if (current->mm != mm)
		return -EINVAL;

	if (mm_valid_pasid(mm) &&
	    !test_bit(MM_CONTEXT_FORCE_TAGGED_SVA, &mm->context.flags))
		return -EINVAL;

	if (mmap_write_lock_killable(mm))
		return -EINTR;

	/*
	 * MM_CONTEXT_LOCK_LAM is set on clone. Prevent LAM from
	 * being enabled unless the process is single threaded:
	 */
	if (test_bit(MM_CONTEXT_LOCK_LAM, &mm->context.flags)) {
		mmap_write_unlock(mm);
		return -EBUSY;
	}

	if (!nr_bits || nr_bits > LAM_U57_BITS) {
		mmap_write_unlock(mm);
		return -EINVAL;
	}

	mm_enable_lam(mm);

	mmap_write_unlock(mm);

	return 0;
}
#endif

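/*
 * Backend for arch_prctl(2) (and the ptrace PTRACE_ARCH_PRCTL request, in
 * which case @task is the stopped tracee rather than current).
 *
 * Illustrative userspace usage (hypothetical values, raw syscall() wrapper
 * and <asm/prctl.h> assumed):
 *
 *	unsigned long gsbase;
 *
 *	syscall(SYS_arch_prctl, ARCH_SET_GS, 0x7f0000000000UL);
 *	syscall(SYS_arch_prctl, ARCH_GET_GS, (unsigned long)&gsbase);
 */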
long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2)
{
	int ret = 0;

	switch (option) {
	case ARCH_SET_GS: {
		if (unlikely(arg2 >= TASK_SIZE_MAX))
			return -EPERM;

		preempt_disable();
		/*
		 * ARCH_SET_GS has always overwritten the index
		 * and the base. Zero is the most sensible value
		 * to put in the index, and is the only value that
		 * makes any sense if FSGSBASE is unavailable.
		 */
		if (task == current) {
			loadseg(GS, 0);
			x86_gsbase_write_cpu_inactive(arg2);

			/*
			 * On non-FSGSBASE systems, save_base_legacy() expects
			 * that we also fill in thread.gsbase.
			 */
			task->thread.gsbase = arg2;

		} else {
			task->thread.gsindex = 0;
			x86_gsbase_write_task(task, arg2);
		}
		preempt_enable();
		break;
	}
	case ARCH_SET_FS: {
		/*
		 * Not strictly needed for %fs, but do it for symmetry
		 * with %gs
		 */
		if (unlikely(arg2 >= TASK_SIZE_MAX))
			return -EPERM;

		preempt_disable();
		/*
		 * Set the selector to 0 for the same reason
		 * as %gs above.
		 */
		if (task == current) {
			loadseg(FS, 0);
			x86_fsbase_write_cpu(arg2);

			/*
			 * On non-FSGSBASE systems, save_base_legacy() expects
			 * that we also fill in thread.fsbase.
			 */
			task->thread.fsbase = arg2;
		} else {
			task->thread.fsindex = 0;
			x86_fsbase_write_task(task, arg2);
		}
		preempt_enable();
		break;
	}
	case ARCH_GET_FS: {
		unsigned long base = x86_fsbase_read_task(task);

		ret = put_user(base, (unsigned long __user *)arg2);
		break;
	}
	case ARCH_GET_GS: {
		unsigned long base = x86_gsbase_read_task(task);

		ret = put_user(base, (unsigned long __user *)arg2);
		break;
	}

#ifdef CONFIG_CHECKPOINT_RESTORE
# ifdef CONFIG_X86_X32_ABI
	case ARCH_MAP_VDSO_X32:
		return prctl_map_vdso(&vdso_image_x32, arg2);
# endif
# ifdef CONFIG_IA32_EMULATION
	case ARCH_MAP_VDSO_32:
		return prctl_map_vdso(&vdso_image_32, arg2);
# endif
	case ARCH_MAP_VDSO_64:
		return prctl_map_vdso(&vdso_image_64, arg2);
#endif
#ifdef CONFIG_ADDRESS_MASKING
	case ARCH_GET_UNTAG_MASK:
		return put_user(task->mm->context.untag_mask,
				(unsigned long __user *)arg2);
	case ARCH_ENABLE_TAGGED_ADDR:
		return prctl_enable_tagged_addr(task->mm, arg2);
	case ARCH_FORCE_TAGGED_SVA:
		if (current != task)
			return -EINVAL;
		set_bit(MM_CONTEXT_FORCE_TAGGED_SVA, &task->mm->context.flags);
		return 0;
	case ARCH_GET_MAX_TAG_BITS:
		if (!cpu_feature_enabled(X86_FEATURE_LAM))
			return put_user(0, (unsigned long __user *)arg2);
		else
			return put_user(LAM_U57_BITS, (unsigned long __user *)arg2);
#endif
	case ARCH_SHSTK_ENABLE:
	case ARCH_SHSTK_DISABLE:
	case ARCH_SHSTK_LOCK:
	case ARCH_SHSTK_UNLOCK:
	case ARCH_SHSTK_STATUS:
		return shstk_prctl(task, option, arg2);
	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}