1 // SPDX-License-Identifier: GPL-2.0 2 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 3 4 #include <linux/errno.h> 5 #include <linux/kernel.h> 6 #include <linux/mm.h> 7 #include <linux/smp.h> 8 #include <linux/cpu.h> 9 #include <linux/prctl.h> 10 #include <linux/slab.h> 11 #include <linux/sched.h> 12 #include <linux/sched/idle.h> 13 #include <linux/sched/debug.h> 14 #include <linux/sched/task.h> 15 #include <linux/sched/task_stack.h> 16 #include <linux/init.h> 17 #include <linux/export.h> 18 #include <linux/pm.h> 19 #include <linux/tick.h> 20 #include <linux/random.h> 21 #include <linux/user-return-notifier.h> 22 #include <linux/dmi.h> 23 #include <linux/utsname.h> 24 #include <linux/stackprotector.h> 25 #include <linux/cpuidle.h> 26 #include <linux/acpi.h> 27 #include <linux/elf-randomize.h> 28 #include <linux/static_call.h> 29 #include <trace/events/power.h> 30 #include <linux/hw_breakpoint.h> 31 #include <asm/cpu.h> 32 #include <asm/apic.h> 33 #include <linux/uaccess.h> 34 #include <asm/mwait.h> 35 #include <asm/fpu/api.h> 36 #include <asm/fpu/sched.h> 37 #include <asm/fpu/xstate.h> 38 #include <asm/debugreg.h> 39 #include <asm/nmi.h> 40 #include <asm/tlbflush.h> 41 #include <asm/mce.h> 42 #include <asm/vm86.h> 43 #include <asm/switch_to.h> 44 #include <asm/desc.h> 45 #include <asm/prctl.h> 46 #include <asm/spec-ctrl.h> 47 #include <asm/io_bitmap.h> 48 #include <asm/proto.h> 49 #include <asm/frame.h> 50 #include <asm/unwind.h> 51 #include <asm/tdx.h> 52 #include <asm/mmu_context.h> 53 54 #include "process.h" 55 56 /* 57 * per-CPU TSS segments. Threads are completely 'soft' on Linux, 58 * no more per-task TSS's. The TSS size is kept cacheline-aligned 59 * so they are allowed to end up in the .data..cacheline_aligned 60 * section. Since TSS's are completely CPU-local, we want them 61 * on exact cacheline boundaries, to eliminate cacheline ping-pong. 62 */ 63 __visible DEFINE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw) = { 64 .x86_tss = { 65 /* 66 * .sp0 is only used when entering ring 0 from a lower 67 * privilege level. Since the init task never runs anything 68 * but ring 0 code, there is no need for a valid value here. 69 * Poison it. 70 */ 71 .sp0 = (1UL << (BITS_PER_LONG-1)) + 1, 72 73 #ifdef CONFIG_X86_32 74 .sp1 = TOP_OF_INIT_STACK, 75 76 .ss0 = __KERNEL_DS, 77 .ss1 = __KERNEL_CS, 78 #endif 79 .io_bitmap_base = IO_BITMAP_OFFSET_INVALID, 80 }, 81 }; 82 EXPORT_PER_CPU_SYMBOL(cpu_tss_rw); 83 84 DEFINE_PER_CPU(bool, __tss_limit_invalid); 85 EXPORT_PER_CPU_SYMBOL_GPL(__tss_limit_invalid); 86 87 /* 88 * this gets called so that we can store lazy state into memory and copy the 89 * current task into the new thread. 90 */ 91 int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) 92 { 93 memcpy(dst, src, arch_task_struct_size); 94 #ifdef CONFIG_VM86 95 dst->thread.vm86 = NULL; 96 #endif 97 /* Drop the copied pointer to current's fpstate */ 98 dst->thread.fpu.fpstate = NULL; 99 100 return 0; 101 } 102 103 #ifdef CONFIG_X86_64 104 void arch_release_task_struct(struct task_struct *tsk) 105 { 106 if (fpu_state_size_dynamic()) 107 fpstate_free(&tsk->thread.fpu); 108 } 109 #endif 110 111 /* 112 * Free thread data structures etc.. 113 */ 114 void exit_thread(struct task_struct *tsk) 115 { 116 struct thread_struct *t = &tsk->thread; 117 struct fpu *fpu = &t->fpu; 118 119 if (test_thread_flag(TIF_IO_BITMAP)) 120 io_bitmap_exit(tsk); 121 122 free_vm86(t); 123 124 fpu__drop(fpu); 125 } 126 127 static int set_new_tls(struct task_struct *p, unsigned long tls) 128 { 129 struct user_desc __user *utls = (struct user_desc __user *)tls; 130 131 if (in_ia32_syscall()) 132 return do_set_thread_area(p, -1, utls, 0); 133 else 134 return do_set_thread_area_64(p, ARCH_SET_FS, tls); 135 } 136 137 int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) 138 { 139 unsigned long clone_flags = args->flags; 140 unsigned long sp = args->stack; 141 unsigned long tls = args->tls; 142 struct inactive_task_frame *frame; 143 struct fork_frame *fork_frame; 144 struct pt_regs *childregs; 145 int ret = 0; 146 147 childregs = task_pt_regs(p); 148 fork_frame = container_of(childregs, struct fork_frame, regs); 149 frame = &fork_frame->frame; 150 151 frame->bp = encode_frame_pointer(childregs); 152 frame->ret_addr = (unsigned long) ret_from_fork; 153 p->thread.sp = (unsigned long) fork_frame; 154 p->thread.io_bitmap = NULL; 155 p->thread.iopl_warn = 0; 156 memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps)); 157 158 #ifdef CONFIG_X86_64 159 current_save_fsgs(); 160 p->thread.fsindex = current->thread.fsindex; 161 p->thread.fsbase = current->thread.fsbase; 162 p->thread.gsindex = current->thread.gsindex; 163 p->thread.gsbase = current->thread.gsbase; 164 165 savesegment(es, p->thread.es); 166 savesegment(ds, p->thread.ds); 167 168 if (p->mm && (clone_flags & (CLONE_VM | CLONE_VFORK)) == CLONE_VM) 169 set_bit(MM_CONTEXT_LOCK_LAM, &p->mm->context.flags); 170 #else 171 p->thread.sp0 = (unsigned long) (childregs + 1); 172 savesegment(gs, p->thread.gs); 173 /* 174 * Clear all status flags including IF and set fixed bit. 64bit 175 * does not have this initialization as the frame does not contain 176 * flags. The flags consistency (especially vs. AC) is there 177 * ensured via objtool, which lacks 32bit support. 178 */ 179 frame->flags = X86_EFLAGS_FIXED; 180 #endif 181 182 fpu_clone(p, clone_flags, args->fn); 183 184 /* Kernel thread ? */ 185 if (unlikely(p->flags & PF_KTHREAD)) { 186 p->thread.pkru = pkru_get_init_value(); 187 memset(childregs, 0, sizeof(struct pt_regs)); 188 kthread_frame_init(frame, args->fn, args->fn_arg); 189 return 0; 190 } 191 192 /* 193 * Clone current's PKRU value from hardware. tsk->thread.pkru 194 * is only valid when scheduled out. 195 */ 196 p->thread.pkru = read_pkru(); 197 198 frame->bx = 0; 199 *childregs = *current_pt_regs(); 200 childregs->ax = 0; 201 if (sp) 202 childregs->sp = sp; 203 204 if (unlikely(args->fn)) { 205 /* 206 * A user space thread, but it doesn't return to 207 * ret_after_fork(). 208 * 209 * In order to indicate that to tools like gdb, 210 * we reset the stack and instruction pointers. 211 * 212 * It does the same kernel frame setup to return to a kernel 213 * function that a kernel thread does. 214 */ 215 childregs->sp = 0; 216 childregs->ip = 0; 217 kthread_frame_init(frame, args->fn, args->fn_arg); 218 return 0; 219 } 220 221 /* Set a new TLS for the child thread? */ 222 if (clone_flags & CLONE_SETTLS) 223 ret = set_new_tls(p, tls); 224 225 if (!ret && unlikely(test_tsk_thread_flag(current, TIF_IO_BITMAP))) 226 io_bitmap_share(p); 227 228 return ret; 229 } 230 231 static void pkru_flush_thread(void) 232 { 233 /* 234 * If PKRU is enabled the default PKRU value has to be loaded into 235 * the hardware right here (similar to context switch). 236 */ 237 pkru_write_default(); 238 } 239 240 void flush_thread(void) 241 { 242 struct task_struct *tsk = current; 243 244 flush_ptrace_hw_breakpoint(tsk); 245 memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); 246 247 fpu_flush_thread(); 248 pkru_flush_thread(); 249 } 250 251 void disable_TSC(void) 252 { 253 preempt_disable(); 254 if (!test_and_set_thread_flag(TIF_NOTSC)) 255 /* 256 * Must flip the CPU state synchronously with 257 * TIF_NOTSC in the current running context. 258 */ 259 cr4_set_bits(X86_CR4_TSD); 260 preempt_enable(); 261 } 262 263 static void enable_TSC(void) 264 { 265 preempt_disable(); 266 if (test_and_clear_thread_flag(TIF_NOTSC)) 267 /* 268 * Must flip the CPU state synchronously with 269 * TIF_NOTSC in the current running context. 270 */ 271 cr4_clear_bits(X86_CR4_TSD); 272 preempt_enable(); 273 } 274 275 int get_tsc_mode(unsigned long adr) 276 { 277 unsigned int val; 278 279 if (test_thread_flag(TIF_NOTSC)) 280 val = PR_TSC_SIGSEGV; 281 else 282 val = PR_TSC_ENABLE; 283 284 return put_user(val, (unsigned int __user *)adr); 285 } 286 287 int set_tsc_mode(unsigned int val) 288 { 289 if (val == PR_TSC_SIGSEGV) 290 disable_TSC(); 291 else if (val == PR_TSC_ENABLE) 292 enable_TSC(); 293 else 294 return -EINVAL; 295 296 return 0; 297 } 298 299 DEFINE_PER_CPU(u64, msr_misc_features_shadow); 300 301 static void set_cpuid_faulting(bool on) 302 { 303 u64 msrval; 304 305 msrval = this_cpu_read(msr_misc_features_shadow); 306 msrval &= ~MSR_MISC_FEATURES_ENABLES_CPUID_FAULT; 307 msrval |= (on << MSR_MISC_FEATURES_ENABLES_CPUID_FAULT_BIT); 308 this_cpu_write(msr_misc_features_shadow, msrval); 309 wrmsrl(MSR_MISC_FEATURES_ENABLES, msrval); 310 } 311 312 static void disable_cpuid(void) 313 { 314 preempt_disable(); 315 if (!test_and_set_thread_flag(TIF_NOCPUID)) { 316 /* 317 * Must flip the CPU state synchronously with 318 * TIF_NOCPUID in the current running context. 319 */ 320 set_cpuid_faulting(true); 321 } 322 preempt_enable(); 323 } 324 325 static void enable_cpuid(void) 326 { 327 preempt_disable(); 328 if (test_and_clear_thread_flag(TIF_NOCPUID)) { 329 /* 330 * Must flip the CPU state synchronously with 331 * TIF_NOCPUID in the current running context. 332 */ 333 set_cpuid_faulting(false); 334 } 335 preempt_enable(); 336 } 337 338 static int get_cpuid_mode(void) 339 { 340 return !test_thread_flag(TIF_NOCPUID); 341 } 342 343 static int set_cpuid_mode(unsigned long cpuid_enabled) 344 { 345 if (!boot_cpu_has(X86_FEATURE_CPUID_FAULT)) 346 return -ENODEV; 347 348 if (cpuid_enabled) 349 enable_cpuid(); 350 else 351 disable_cpuid(); 352 353 return 0; 354 } 355 356 /* 357 * Called immediately after a successful exec. 358 */ 359 void arch_setup_new_exec(void) 360 { 361 /* If cpuid was previously disabled for this task, re-enable it. */ 362 if (test_thread_flag(TIF_NOCPUID)) 363 enable_cpuid(); 364 365 /* 366 * Don't inherit TIF_SSBD across exec boundary when 367 * PR_SPEC_DISABLE_NOEXEC is used. 368 */ 369 if (test_thread_flag(TIF_SSBD) && 370 task_spec_ssb_noexec(current)) { 371 clear_thread_flag(TIF_SSBD); 372 task_clear_spec_ssb_disable(current); 373 task_clear_spec_ssb_noexec(current); 374 speculation_ctrl_update(read_thread_flags()); 375 } 376 377 mm_reset_untag_mask(current->mm); 378 } 379 380 #ifdef CONFIG_X86_IOPL_IOPERM 381 static inline void switch_to_bitmap(unsigned long tifp) 382 { 383 /* 384 * Invalidate I/O bitmap if the previous task used it. This prevents 385 * any possible leakage of an active I/O bitmap. 386 * 387 * If the next task has an I/O bitmap it will handle it on exit to 388 * user mode. 389 */ 390 if (tifp & _TIF_IO_BITMAP) 391 tss_invalidate_io_bitmap(); 392 } 393 394 static void tss_copy_io_bitmap(struct tss_struct *tss, struct io_bitmap *iobm) 395 { 396 /* 397 * Copy at least the byte range of the incoming tasks bitmap which 398 * covers the permitted I/O ports. 399 * 400 * If the previous task which used an I/O bitmap had more bits 401 * permitted, then the copy needs to cover those as well so they 402 * get turned off. 403 */ 404 memcpy(tss->io_bitmap.bitmap, iobm->bitmap, 405 max(tss->io_bitmap.prev_max, iobm->max)); 406 407 /* 408 * Store the new max and the sequence number of this bitmap 409 * and a pointer to the bitmap itself. 410 */ 411 tss->io_bitmap.prev_max = iobm->max; 412 tss->io_bitmap.prev_sequence = iobm->sequence; 413 } 414 415 /** 416 * native_tss_update_io_bitmap - Update I/O bitmap before exiting to user mode 417 */ 418 void native_tss_update_io_bitmap(void) 419 { 420 struct tss_struct *tss = this_cpu_ptr(&cpu_tss_rw); 421 struct thread_struct *t = ¤t->thread; 422 u16 *base = &tss->x86_tss.io_bitmap_base; 423 424 if (!test_thread_flag(TIF_IO_BITMAP)) { 425 native_tss_invalidate_io_bitmap(); 426 return; 427 } 428 429 if (IS_ENABLED(CONFIG_X86_IOPL_IOPERM) && t->iopl_emul == 3) { 430 *base = IO_BITMAP_OFFSET_VALID_ALL; 431 } else { 432 struct io_bitmap *iobm = t->io_bitmap; 433 434 /* 435 * Only copy bitmap data when the sequence number differs. The 436 * update time is accounted to the incoming task. 437 */ 438 if (tss->io_bitmap.prev_sequence != iobm->sequence) 439 tss_copy_io_bitmap(tss, iobm); 440 441 /* Enable the bitmap */ 442 *base = IO_BITMAP_OFFSET_VALID_MAP; 443 } 444 445 /* 446 * Make sure that the TSS limit is covering the IO bitmap. It might have 447 * been cut down by a VMEXIT to 0x67 which would cause a subsequent I/O 448 * access from user space to trigger a #GP because tbe bitmap is outside 449 * the TSS limit. 450 */ 451 refresh_tss_limit(); 452 } 453 #else /* CONFIG_X86_IOPL_IOPERM */ 454 static inline void switch_to_bitmap(unsigned long tifp) { } 455 #endif 456 457 #ifdef CONFIG_SMP 458 459 struct ssb_state { 460 struct ssb_state *shared_state; 461 raw_spinlock_t lock; 462 unsigned int disable_state; 463 unsigned long local_state; 464 }; 465 466 #define LSTATE_SSB 0 467 468 static DEFINE_PER_CPU(struct ssb_state, ssb_state); 469 470 void speculative_store_bypass_ht_init(void) 471 { 472 struct ssb_state *st = this_cpu_ptr(&ssb_state); 473 unsigned int this_cpu = smp_processor_id(); 474 unsigned int cpu; 475 476 st->local_state = 0; 477 478 /* 479 * Shared state setup happens once on the first bringup 480 * of the CPU. It's not destroyed on CPU hotunplug. 481 */ 482 if (st->shared_state) 483 return; 484 485 raw_spin_lock_init(&st->lock); 486 487 /* 488 * Go over HT siblings and check whether one of them has set up the 489 * shared state pointer already. 490 */ 491 for_each_cpu(cpu, topology_sibling_cpumask(this_cpu)) { 492 if (cpu == this_cpu) 493 continue; 494 495 if (!per_cpu(ssb_state, cpu).shared_state) 496 continue; 497 498 /* Link it to the state of the sibling: */ 499 st->shared_state = per_cpu(ssb_state, cpu).shared_state; 500 return; 501 } 502 503 /* 504 * First HT sibling to come up on the core. Link shared state of 505 * the first HT sibling to itself. The siblings on the same core 506 * which come up later will see the shared state pointer and link 507 * themselves to the state of this CPU. 508 */ 509 st->shared_state = st; 510 } 511 512 /* 513 * Logic is: First HT sibling enables SSBD for both siblings in the core 514 * and last sibling to disable it, disables it for the whole core. This how 515 * MSR_SPEC_CTRL works in "hardware": 516 * 517 * CORE_SPEC_CTRL = THREAD0_SPEC_CTRL | THREAD1_SPEC_CTRL 518 */ 519 static __always_inline void amd_set_core_ssb_state(unsigned long tifn) 520 { 521 struct ssb_state *st = this_cpu_ptr(&ssb_state); 522 u64 msr = x86_amd_ls_cfg_base; 523 524 if (!static_cpu_has(X86_FEATURE_ZEN)) { 525 msr |= ssbd_tif_to_amd_ls_cfg(tifn); 526 wrmsrl(MSR_AMD64_LS_CFG, msr); 527 return; 528 } 529 530 if (tifn & _TIF_SSBD) { 531 /* 532 * Since this can race with prctl(), block reentry on the 533 * same CPU. 534 */ 535 if (__test_and_set_bit(LSTATE_SSB, &st->local_state)) 536 return; 537 538 msr |= x86_amd_ls_cfg_ssbd_mask; 539 540 raw_spin_lock(&st->shared_state->lock); 541 /* First sibling enables SSBD: */ 542 if (!st->shared_state->disable_state) 543 wrmsrl(MSR_AMD64_LS_CFG, msr); 544 st->shared_state->disable_state++; 545 raw_spin_unlock(&st->shared_state->lock); 546 } else { 547 if (!__test_and_clear_bit(LSTATE_SSB, &st->local_state)) 548 return; 549 550 raw_spin_lock(&st->shared_state->lock); 551 st->shared_state->disable_state--; 552 if (!st->shared_state->disable_state) 553 wrmsrl(MSR_AMD64_LS_CFG, msr); 554 raw_spin_unlock(&st->shared_state->lock); 555 } 556 } 557 #else 558 static __always_inline void amd_set_core_ssb_state(unsigned long tifn) 559 { 560 u64 msr = x86_amd_ls_cfg_base | ssbd_tif_to_amd_ls_cfg(tifn); 561 562 wrmsrl(MSR_AMD64_LS_CFG, msr); 563 } 564 #endif 565 566 static __always_inline void amd_set_ssb_virt_state(unsigned long tifn) 567 { 568 /* 569 * SSBD has the same definition in SPEC_CTRL and VIRT_SPEC_CTRL, 570 * so ssbd_tif_to_spec_ctrl() just works. 571 */ 572 wrmsrl(MSR_AMD64_VIRT_SPEC_CTRL, ssbd_tif_to_spec_ctrl(tifn)); 573 } 574 575 /* 576 * Update the MSRs managing speculation control, during context switch. 577 * 578 * tifp: Previous task's thread flags 579 * tifn: Next task's thread flags 580 */ 581 static __always_inline void __speculation_ctrl_update(unsigned long tifp, 582 unsigned long tifn) 583 { 584 unsigned long tif_diff = tifp ^ tifn; 585 u64 msr = x86_spec_ctrl_base; 586 bool updmsr = false; 587 588 lockdep_assert_irqs_disabled(); 589 590 /* Handle change of TIF_SSBD depending on the mitigation method. */ 591 if (static_cpu_has(X86_FEATURE_VIRT_SSBD)) { 592 if (tif_diff & _TIF_SSBD) 593 amd_set_ssb_virt_state(tifn); 594 } else if (static_cpu_has(X86_FEATURE_LS_CFG_SSBD)) { 595 if (tif_diff & _TIF_SSBD) 596 amd_set_core_ssb_state(tifn); 597 } else if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) || 598 static_cpu_has(X86_FEATURE_AMD_SSBD)) { 599 updmsr |= !!(tif_diff & _TIF_SSBD); 600 msr |= ssbd_tif_to_spec_ctrl(tifn); 601 } 602 603 /* Only evaluate TIF_SPEC_IB if conditional STIBP is enabled. */ 604 if (IS_ENABLED(CONFIG_SMP) && 605 static_branch_unlikely(&switch_to_cond_stibp)) { 606 updmsr |= !!(tif_diff & _TIF_SPEC_IB); 607 msr |= stibp_tif_to_spec_ctrl(tifn); 608 } 609 610 if (updmsr) 611 update_spec_ctrl_cond(msr); 612 } 613 614 static unsigned long speculation_ctrl_update_tif(struct task_struct *tsk) 615 { 616 if (test_and_clear_tsk_thread_flag(tsk, TIF_SPEC_FORCE_UPDATE)) { 617 if (task_spec_ssb_disable(tsk)) 618 set_tsk_thread_flag(tsk, TIF_SSBD); 619 else 620 clear_tsk_thread_flag(tsk, TIF_SSBD); 621 622 if (task_spec_ib_disable(tsk)) 623 set_tsk_thread_flag(tsk, TIF_SPEC_IB); 624 else 625 clear_tsk_thread_flag(tsk, TIF_SPEC_IB); 626 } 627 /* Return the updated threadinfo flags*/ 628 return read_task_thread_flags(tsk); 629 } 630 631 void speculation_ctrl_update(unsigned long tif) 632 { 633 unsigned long flags; 634 635 /* Forced update. Make sure all relevant TIF flags are different */ 636 local_irq_save(flags); 637 __speculation_ctrl_update(~tif, tif); 638 local_irq_restore(flags); 639 } 640 641 /* Called from seccomp/prctl update */ 642 void speculation_ctrl_update_current(void) 643 { 644 preempt_disable(); 645 speculation_ctrl_update(speculation_ctrl_update_tif(current)); 646 preempt_enable(); 647 } 648 649 static inline void cr4_toggle_bits_irqsoff(unsigned long mask) 650 { 651 unsigned long newval, cr4 = this_cpu_read(cpu_tlbstate.cr4); 652 653 newval = cr4 ^ mask; 654 if (newval != cr4) { 655 this_cpu_write(cpu_tlbstate.cr4, newval); 656 __write_cr4(newval); 657 } 658 } 659 660 void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p) 661 { 662 unsigned long tifp, tifn; 663 664 tifn = read_task_thread_flags(next_p); 665 tifp = read_task_thread_flags(prev_p); 666 667 switch_to_bitmap(tifp); 668 669 propagate_user_return_notify(prev_p, next_p); 670 671 if ((tifp & _TIF_BLOCKSTEP || tifn & _TIF_BLOCKSTEP) && 672 arch_has_block_step()) { 673 unsigned long debugctl, msk; 674 675 rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); 676 debugctl &= ~DEBUGCTLMSR_BTF; 677 msk = tifn & _TIF_BLOCKSTEP; 678 debugctl |= (msk >> TIF_BLOCKSTEP) << DEBUGCTLMSR_BTF_SHIFT; 679 wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); 680 } 681 682 if ((tifp ^ tifn) & _TIF_NOTSC) 683 cr4_toggle_bits_irqsoff(X86_CR4_TSD); 684 685 if ((tifp ^ tifn) & _TIF_NOCPUID) 686 set_cpuid_faulting(!!(tifn & _TIF_NOCPUID)); 687 688 if (likely(!((tifp | tifn) & _TIF_SPEC_FORCE_UPDATE))) { 689 __speculation_ctrl_update(tifp, tifn); 690 } else { 691 speculation_ctrl_update_tif(prev_p); 692 tifn = speculation_ctrl_update_tif(next_p); 693 694 /* Enforce MSR update to ensure consistent state */ 695 __speculation_ctrl_update(~tifn, tifn); 696 } 697 } 698 699 /* 700 * Idle related variables and functions 701 */ 702 unsigned long boot_option_idle_override = IDLE_NO_OVERRIDE; 703 EXPORT_SYMBOL(boot_option_idle_override); 704 705 /* 706 * We use this if we don't have any better idle routine.. 707 */ 708 void __cpuidle default_idle(void) 709 { 710 raw_safe_halt(); 711 raw_local_irq_disable(); 712 } 713 #if defined(CONFIG_APM_MODULE) || defined(CONFIG_HALTPOLL_CPUIDLE_MODULE) 714 EXPORT_SYMBOL(default_idle); 715 #endif 716 717 DEFINE_STATIC_CALL_NULL(x86_idle, default_idle); 718 719 static bool x86_idle_set(void) 720 { 721 return !!static_call_query(x86_idle); 722 } 723 724 #ifndef CONFIG_SMP 725 static inline void __noreturn play_dead(void) 726 { 727 BUG(); 728 } 729 #endif 730 731 void arch_cpu_idle_enter(void) 732 { 733 tsc_verify_tsc_adjust(false); 734 local_touch_nmi(); 735 } 736 737 void __noreturn arch_cpu_idle_dead(void) 738 { 739 play_dead(); 740 } 741 742 /* 743 * Called from the generic idle code. 744 */ 745 void __cpuidle arch_cpu_idle(void) 746 { 747 static_call(x86_idle)(); 748 } 749 EXPORT_SYMBOL_GPL(arch_cpu_idle); 750 751 #ifdef CONFIG_XEN 752 bool xen_set_default_idle(void) 753 { 754 bool ret = x86_idle_set(); 755 756 static_call_update(x86_idle, default_idle); 757 758 return ret; 759 } 760 #endif 761 762 struct cpumask cpus_stop_mask; 763 764 void __noreturn stop_this_cpu(void *dummy) 765 { 766 struct cpuinfo_x86 *c = this_cpu_ptr(&cpu_info); 767 unsigned int cpu = smp_processor_id(); 768 769 local_irq_disable(); 770 771 /* 772 * Remove this CPU from the online mask and disable it 773 * unconditionally. This might be redundant in case that the reboot 774 * vector was handled late and stop_other_cpus() sent an NMI. 775 * 776 * According to SDM and APM NMIs can be accepted even after soft 777 * disabling the local APIC. 778 */ 779 set_cpu_online(cpu, false); 780 disable_local_APIC(); 781 mcheck_cpu_clear(c); 782 783 /* 784 * Use wbinvd on processors that support SME. This provides support 785 * for performing a successful kexec when going from SME inactive 786 * to SME active (or vice-versa). The cache must be cleared so that 787 * if there are entries with the same physical address, both with and 788 * without the encryption bit, they don't race each other when flushed 789 * and potentially end up with the wrong entry being committed to 790 * memory. 791 * 792 * Test the CPUID bit directly because the machine might've cleared 793 * X86_FEATURE_SME due to cmdline options. 794 */ 795 if (c->extended_cpuid_level >= 0x8000001f && (cpuid_eax(0x8000001f) & BIT(0))) 796 native_wbinvd(); 797 798 /* 799 * This brings a cache line back and dirties it, but 800 * native_stop_other_cpus() will overwrite cpus_stop_mask after it 801 * observed that all CPUs reported stop. This write will invalidate 802 * the related cache line on this CPU. 803 */ 804 cpumask_clear_cpu(cpu, &cpus_stop_mask); 805 806 for (;;) { 807 /* 808 * Use native_halt() so that memory contents don't change 809 * (stack usage and variables) after possibly issuing the 810 * native_wbinvd() above. 811 */ 812 native_halt(); 813 } 814 } 815 816 /* 817 * AMD Erratum 400 aware idle routine. We handle it the same way as C3 power 818 * states (local apic timer and TSC stop). 819 * 820 * XXX this function is completely buggered vs RCU and tracing. 821 */ 822 static void amd_e400_idle(void) 823 { 824 /* 825 * We cannot use static_cpu_has_bug() here because X86_BUG_AMD_APIC_C1E 826 * gets set after static_cpu_has() places have been converted via 827 * alternatives. 828 */ 829 if (!boot_cpu_has_bug(X86_BUG_AMD_APIC_C1E)) { 830 default_idle(); 831 return; 832 } 833 834 tick_broadcast_enter(); 835 836 default_idle(); 837 838 tick_broadcast_exit(); 839 } 840 841 /* 842 * Prefer MWAIT over HALT if MWAIT is supported, MWAIT_CPUID leaf 843 * exists and whenever MONITOR/MWAIT extensions are present there is at 844 * least one C1 substate. 845 * 846 * Do not prefer MWAIT if MONITOR instruction has a bug or idle=nomwait 847 * is passed to kernel commandline parameter. 848 */ 849 static int prefer_mwait_c1_over_halt(const struct cpuinfo_x86 *c) 850 { 851 u32 eax, ebx, ecx, edx; 852 853 /* User has disallowed the use of MWAIT. Fallback to HALT */ 854 if (boot_option_idle_override == IDLE_NOMWAIT) 855 return 0; 856 857 /* MWAIT is not supported on this platform. Fallback to HALT */ 858 if (!cpu_has(c, X86_FEATURE_MWAIT)) 859 return 0; 860 861 /* Monitor has a bug. Fallback to HALT */ 862 if (boot_cpu_has_bug(X86_BUG_MONITOR)) 863 return 0; 864 865 cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx); 866 867 /* 868 * If MWAIT extensions are not available, it is safe to use MWAIT 869 * with EAX=0, ECX=0. 870 */ 871 if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED)) 872 return 1; 873 874 /* 875 * If MWAIT extensions are available, there should be at least one 876 * MWAIT C1 substate present. 877 */ 878 return (edx & MWAIT_C1_SUBSTATE_MASK); 879 } 880 881 /* 882 * MONITOR/MWAIT with no hints, used for default C1 state. This invokes MWAIT 883 * with interrupts enabled and no flags, which is backwards compatible with the 884 * original MWAIT implementation. 885 */ 886 static __cpuidle void mwait_idle(void) 887 { 888 if (!current_set_polling_and_test()) { 889 if (this_cpu_has(X86_BUG_CLFLUSH_MONITOR)) { 890 mb(); /* quirk */ 891 clflush((void *)¤t_thread_info()->flags); 892 mb(); /* quirk */ 893 } 894 895 __monitor((void *)¤t_thread_info()->flags, 0, 0); 896 if (!need_resched()) { 897 __sti_mwait(0, 0); 898 raw_local_irq_disable(); 899 } 900 } 901 __current_clr_polling(); 902 } 903 904 void select_idle_routine(const struct cpuinfo_x86 *c) 905 { 906 #ifdef CONFIG_SMP 907 if (boot_option_idle_override == IDLE_POLL && smp_num_siblings > 1) 908 pr_warn_once("WARNING: polling idle and HT enabled, performance may degrade\n"); 909 #endif 910 if (x86_idle_set() || boot_option_idle_override == IDLE_POLL) 911 return; 912 913 if (boot_cpu_has_bug(X86_BUG_AMD_E400)) { 914 pr_info("using AMD E400 aware idle routine\n"); 915 static_call_update(x86_idle, amd_e400_idle); 916 } else if (prefer_mwait_c1_over_halt(c)) { 917 pr_info("using mwait in idle threads\n"); 918 static_call_update(x86_idle, mwait_idle); 919 } else if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) { 920 pr_info("using TDX aware idle routine\n"); 921 static_call_update(x86_idle, tdx_safe_halt); 922 } else 923 static_call_update(x86_idle, default_idle); 924 } 925 926 void amd_e400_c1e_apic_setup(void) 927 { 928 if (boot_cpu_has_bug(X86_BUG_AMD_APIC_C1E)) { 929 pr_info("Switch to broadcast mode on CPU%d\n", smp_processor_id()); 930 local_irq_disable(); 931 tick_broadcast_force(); 932 local_irq_enable(); 933 } 934 } 935 936 void __init arch_post_acpi_subsys_init(void) 937 { 938 u32 lo, hi; 939 940 if (!boot_cpu_has_bug(X86_BUG_AMD_E400)) 941 return; 942 943 /* 944 * AMD E400 detection needs to happen after ACPI has been enabled. If 945 * the machine is affected K8_INTP_C1E_ACTIVE_MASK bits are set in 946 * MSR_K8_INT_PENDING_MSG. 947 */ 948 rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi); 949 if (!(lo & K8_INTP_C1E_ACTIVE_MASK)) 950 return; 951 952 boot_cpu_set_bug(X86_BUG_AMD_APIC_C1E); 953 954 if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) 955 mark_tsc_unstable("TSC halt in AMD C1E"); 956 pr_info("System has AMD C1E enabled\n"); 957 } 958 959 static int __init idle_setup(char *str) 960 { 961 if (!str) 962 return -EINVAL; 963 964 if (!strcmp(str, "poll")) { 965 pr_info("using polling idle threads\n"); 966 boot_option_idle_override = IDLE_POLL; 967 cpu_idle_poll_ctrl(true); 968 } else if (!strcmp(str, "halt")) { 969 /* 970 * When the boot option of idle=halt is added, halt is 971 * forced to be used for CPU idle. In such case CPU C2/C3 972 * won't be used again. 973 * To continue to load the CPU idle driver, don't touch 974 * the boot_option_idle_override. 975 */ 976 static_call_update(x86_idle, default_idle); 977 boot_option_idle_override = IDLE_HALT; 978 } else if (!strcmp(str, "nomwait")) { 979 /* 980 * If the boot option of "idle=nomwait" is added, 981 * it means that mwait will be disabled for CPU C1/C2/C3 982 * states. 983 */ 984 boot_option_idle_override = IDLE_NOMWAIT; 985 } else 986 return -1; 987 988 return 0; 989 } 990 early_param("idle", idle_setup); 991 992 unsigned long arch_align_stack(unsigned long sp) 993 { 994 if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) 995 sp -= get_random_u32_below(8192); 996 return sp & ~0xf; 997 } 998 999 unsigned long arch_randomize_brk(struct mm_struct *mm) 1000 { 1001 return randomize_page(mm->brk, 0x02000000); 1002 } 1003 1004 /* 1005 * Called from fs/proc with a reference on @p to find the function 1006 * which called into schedule(). This needs to be done carefully 1007 * because the task might wake up and we might look at a stack 1008 * changing under us. 1009 */ 1010 unsigned long __get_wchan(struct task_struct *p) 1011 { 1012 struct unwind_state state; 1013 unsigned long addr = 0; 1014 1015 if (!try_get_task_stack(p)) 1016 return 0; 1017 1018 for (unwind_start(&state, p, NULL, NULL); !unwind_done(&state); 1019 unwind_next_frame(&state)) { 1020 addr = unwind_get_return_address(&state); 1021 if (!addr) 1022 break; 1023 if (in_sched_functions(addr)) 1024 continue; 1025 break; 1026 } 1027 1028 put_task_stack(p); 1029 1030 return addr; 1031 } 1032 1033 long do_arch_prctl_common(int option, unsigned long arg2) 1034 { 1035 switch (option) { 1036 case ARCH_GET_CPUID: 1037 return get_cpuid_mode(); 1038 case ARCH_SET_CPUID: 1039 return set_cpuid_mode(arg2); 1040 case ARCH_GET_XCOMP_SUPP: 1041 case ARCH_GET_XCOMP_PERM: 1042 case ARCH_REQ_XCOMP_PERM: 1043 case ARCH_GET_XCOMP_GUEST_PERM: 1044 case ARCH_REQ_XCOMP_GUEST_PERM: 1045 return fpu_xstate_prctl(option, arg2); 1046 } 1047 1048 return -EINVAL; 1049 } 1050