#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/prctl.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/module.h>
#include <linux/pm.h>
#include <linux/tick.h>
#include <linux/random.h>
#include <linux/user-return-notifier.h>
#include <linux/dmi.h>
#include <linux/utsname.h>
#include <linux/stackprotector.h>
#include <linux/cpuidle.h>
#include <trace/events/power.h>
#include <linux/hw_breakpoint.h>
#include <asm/cpu.h>
#include <asm/apic.h>
#include <asm/syscalls.h>
#include <asm/idle.h>
#include <asm/uaccess.h>
#include <asm/mwait.h>
#include <asm/fpu/internal.h>
#include <asm/debugreg.h>
#include <asm/nmi.h>
#include <asm/tlbflush.h>
#include <asm/mce.h>
#include <asm/vm86.h>

/*
 * per-CPU TSS segments. Threads are completely 'soft' on Linux,
 * no more per-task TSS's. The TSS size is kept cacheline-aligned
 * so they are allowed to end up in the .data..cacheline_aligned
 * section. Since TSS's are completely CPU-local, we want them
 * on exact cacheline boundaries, to eliminate cacheline ping-pong.
 */
__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
	.x86_tss = {
		.sp0 = TOP_OF_INIT_STACK,
#ifdef CONFIG_X86_32
		.ss0 = __KERNEL_DS,
		.ss1 = __KERNEL_CS,
		.io_bitmap_base	= INVALID_IO_BITMAP_OFFSET,
#endif
	},
#ifdef CONFIG_X86_32
	/*
	 * Note that the .io_bitmap member must be extra-big. This is because
	 * the CPU will access an additional byte beyond the end of the IO
	 * permission bitmap. The extra byte must be all 1 bits, and must
	 * be within the limit.
	 */
	.io_bitmap		= { [0 ... IO_BITMAP_LONGS] = ~0 },
#endif
#ifdef CONFIG_X86_32
	.SYSENTER_stack_canary	= STACK_END_MAGIC,
#endif
};
EXPORT_PER_CPU_SYMBOL(cpu_tss);

#ifdef CONFIG_X86_64
static DEFINE_PER_CPU(unsigned char, is_idle);
static ATOMIC_NOTIFIER_HEAD(idle_notifier);

void idle_notifier_register(struct notifier_block *n)
{
	atomic_notifier_chain_register(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_register);

void idle_notifier_unregister(struct notifier_block *n)
{
	atomic_notifier_chain_unregister(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_unregister);
#endif
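/*
 * Illustrative sketch only (not part of this file): a driver interested in
 * the x86-64 idle notifications above could hook them roughly as follows.
 * The callback and notifier_block names are hypothetical; IDLE_START and
 * IDLE_END are the events passed to atomic_notifier_call_chain() from
 * enter_idle()/__exit_idle() below.
 *
 *	static int example_idle_event(struct notifier_block *nb,
 *				      unsigned long action, void *data)
 *	{
 *		if (action == IDLE_START)
 *			pr_debug("cpu entering idle\n");
 *		else if (action == IDLE_END)
 *			pr_debug("cpu leaving idle\n");
 *		return NOTIFY_OK;
 *	}
 *
 *	static struct notifier_block example_idle_nb = {
 *		.notifier_call = example_idle_event,
 *	};
 *
 *	static int __init example_init(void)
 *	{
 *		idle_notifier_register(&example_idle_nb);
 *		return 0;
 *	}
 */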
/*
 * this gets called so that we can store lazy state into memory and copy the
 * current task into the new thread.
 */
int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
{
	memcpy(dst, src, arch_task_struct_size);
#ifdef CONFIG_VM86
	dst->thread.vm86 = NULL;
#endif

	return fpu__copy(&dst->thread.fpu, &src->thread.fpu);
}

/*
 * Free current thread data structures etc..
 */
void exit_thread(struct task_struct *tsk)
{
	struct thread_struct *t = &tsk->thread;
	unsigned long *bp = t->io_bitmap_ptr;
	struct fpu *fpu = &t->fpu;

	if (bp) {
		struct tss_struct *tss = &per_cpu(cpu_tss, get_cpu());

		t->io_bitmap_ptr = NULL;
		clear_thread_flag(TIF_IO_BITMAP);
		/*
		 * Careful, clear this in the TSS too:
		 */
		memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
		t->io_bitmap_max = 0;
		put_cpu();
		kfree(bp);
	}

	free_vm86(t);

	fpu__drop(fpu);
}

void flush_thread(void)
{
	struct task_struct *tsk = current;

	flush_ptrace_hw_breakpoint(tsk);
	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));

	fpu__clear(&tsk->thread.fpu);
}

static void hard_disable_TSC(void)
{
	cr4_set_bits(X86_CR4_TSD);
}

void disable_TSC(void)
{
	preempt_disable();
	if (!test_and_set_thread_flag(TIF_NOTSC))
		/*
		 * Must flip the CPU state synchronously with
		 * TIF_NOTSC in the current running context.
		 */
		hard_disable_TSC();
	preempt_enable();
}

static void hard_enable_TSC(void)
{
	cr4_clear_bits(X86_CR4_TSD);
}

static void enable_TSC(void)
{
	preempt_disable();
	if (test_and_clear_thread_flag(TIF_NOTSC))
		/*
		 * Must flip the CPU state synchronously with
		 * TIF_NOTSC in the current running context.
		 */
		hard_enable_TSC();
	preempt_enable();
}

int get_tsc_mode(unsigned long adr)
{
	unsigned int val;

	if (test_thread_flag(TIF_NOTSC))
		val = PR_TSC_SIGSEGV;
	else
		val = PR_TSC_ENABLE;

	return put_user(val, (unsigned int __user *)adr);
}

int set_tsc_mode(unsigned int val)
{
	if (val == PR_TSC_SIGSEGV)
		disable_TSC();
	else if (val == PR_TSC_ENABLE)
		enable_TSC();
	else
		return -EINVAL;

	return 0;
}
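/*
 * get_tsc_mode()/set_tsc_mode() back the PR_GET_TSC/PR_SET_TSC prctl(2)
 * operations. A minimal userspace sketch (illustrative, not part of the
 * kernel): after PR_TSC_SIGSEGV takes effect, a subsequent RDTSC in that
 * task faults with SIGSEGV, because CR4.TSD is kept in sync on every
 * switch-in by __switch_to_xtra() below.
 *
 *	#include <stdio.h>
 *	#include <sys/prctl.h>
 *
 *	int main(void)
 *	{
 *		int mode = 0;
 *
 *		prctl(PR_GET_TSC, &mode);
 *		printf("TSC mode: %d\n", mode);
 *
 *		prctl(PR_SET_TSC, PR_TSC_SIGSEGV);
 *		return 0;
 *	}
 */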
void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
		      struct tss_struct *tss)
{
	struct thread_struct *prev, *next;

	prev = &prev_p->thread;
	next = &next_p->thread;

	if (test_tsk_thread_flag(prev_p, TIF_BLOCKSTEP) ^
	    test_tsk_thread_flag(next_p, TIF_BLOCKSTEP)) {
		unsigned long debugctl = get_debugctlmsr();

		debugctl &= ~DEBUGCTLMSR_BTF;
		if (test_tsk_thread_flag(next_p, TIF_BLOCKSTEP))
			debugctl |= DEBUGCTLMSR_BTF;

		update_debugctlmsr(debugctl);
	}

	if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
	    test_tsk_thread_flag(next_p, TIF_NOTSC)) {
		/* prev and next are different */
		if (test_tsk_thread_flag(next_p, TIF_NOTSC))
			hard_disable_TSC();
		else
			hard_enable_TSC();
	}

	if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
		/*
		 * Copy the relevant range of the IO bitmap.
		 * Normally this is 128 bytes or less:
		 */
		memcpy(tss->io_bitmap, next->io_bitmap_ptr,
		       max(prev->io_bitmap_max, next->io_bitmap_max));
	} else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
		/*
		 * Clear any possible leftover bits:
		 */
		memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
	}
	propagate_user_return_notify(prev_p, next_p);
}
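/*
 * The io_bitmap copy above is what makes ioperm(2) per-task: sys_ioperm()
 * allocates thread.io_bitmap_ptr and sets TIF_IO_BITMAP, and the bitmap
 * copied into the TSS on switch-in is what the CPU consults for port
 * accesses at CPL 3. Rough userspace sketch (illustrative only, requires
 * CAP_SYS_RAWIO; 0x378 is just an example port range):
 *
 *	#include <sys/io.h>
 *
 *	int main(void)
 *	{
 *		if (ioperm(0x378, 4, 1))
 *			return 1;
 *		outb(0xff, 0x378);
 *		return 0;
 *	}
 */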
/*
 * Idle related variables and functions
 */
unsigned long boot_option_idle_override = IDLE_NO_OVERRIDE;
EXPORT_SYMBOL(boot_option_idle_override);

static void (*x86_idle)(void);

#ifndef CONFIG_SMP
static inline void play_dead(void)
{
	BUG();
}
#endif

#ifdef CONFIG_X86_64
void enter_idle(void)
{
	this_cpu_write(is_idle, 1);
	atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}

static void __exit_idle(void)
{
	if (x86_test_and_clear_bit_percpu(0, is_idle) == 0)
		return;
	atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
}

/* Called from interrupts to signify idle end */
void exit_idle(void)
{
	/* idle loop has pid 0 */
	if (current->pid)
		return;
	__exit_idle();
}
#endif

void arch_cpu_idle_enter(void)
{
	local_touch_nmi();
	enter_idle();
}

void arch_cpu_idle_exit(void)
{
	__exit_idle();
}

void arch_cpu_idle_dead(void)
{
	play_dead();
}

/*
 * Called from the generic idle code.
 */
void arch_cpu_idle(void)
{
	x86_idle();
}

/*
 * We use this if we don't have any better idle routine..
 */
void default_idle(void)
{
	trace_cpu_idle_rcuidle(1, smp_processor_id());
	safe_halt();
	trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
}
#ifdef CONFIG_APM_MODULE
EXPORT_SYMBOL(default_idle);
#endif

#ifdef CONFIG_XEN
bool xen_set_default_idle(void)
{
	bool ret = !!x86_idle;

	x86_idle = default_idle;

	return ret;
}
#endif

void stop_this_cpu(void *dummy)
{
	local_irq_disable();
	/*
	 * Remove this CPU:
	 */
	set_cpu_online(smp_processor_id(), false);
	disable_local_APIC();
	mcheck_cpu_clear(this_cpu_ptr(&cpu_info));

	for (;;)
		halt();
}

bool amd_e400_c1e_detected;
EXPORT_SYMBOL(amd_e400_c1e_detected);

static cpumask_var_t amd_e400_c1e_mask;

void amd_e400_remove_cpu(int cpu)
{
	if (amd_e400_c1e_mask != NULL)
		cpumask_clear_cpu(cpu, amd_e400_c1e_mask);
}

/*
 * AMD Erratum 400 aware idle routine. We check for C1E active in the interrupt
 * pending message MSR. If we detect C1E, then we handle it the same
 * way as C3 power states (local apic timer and TSC stop)
 */
static void amd_e400_idle(void)
{
	if (!amd_e400_c1e_detected) {
		u32 lo, hi;

		rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);

		if (lo & K8_INTP_C1E_ACTIVE_MASK) {
			amd_e400_c1e_detected = true;
			if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
				mark_tsc_unstable("TSC halt in AMD C1E");
			pr_info("System has AMD C1E enabled\n");
		}
	}

	if (amd_e400_c1e_detected) {
		int cpu = smp_processor_id();

		if (!cpumask_test_cpu(cpu, amd_e400_c1e_mask)) {
			cpumask_set_cpu(cpu, amd_e400_c1e_mask);
			/* Force broadcast so ACPI can not interfere. */
			tick_broadcast_force();
			pr_info("Switch to broadcast mode on CPU%d\n", cpu);
		}
		tick_broadcast_enter();

		default_idle();

		/*
		 * The switch back from broadcast mode needs to be
		 * called with interrupts disabled.
		 */
		local_irq_disable();
		tick_broadcast_exit();
		local_irq_enable();
	} else
		default_idle();
}

/*
 * Intel Core2 and older machines prefer MWAIT over HALT for C1.
 * We can't rely on cpuidle installing MWAIT, because it will not load
 * on systems that support only C1 -- so the boot default must be MWAIT.
 *
 * Some AMD machines are the opposite, they depend on using HALT.
 *
 * So for default C1, which is used during boot until cpuidle loads,
 * use MWAIT-C1 on Intel HW that has it, else use HALT.
 */
static int prefer_mwait_c1_over_halt(const struct cpuinfo_x86 *c)
{
	if (c->x86_vendor != X86_VENDOR_INTEL)
		return 0;

	if (!cpu_has(c, X86_FEATURE_MWAIT))
		return 0;

	return 1;
}

/*
 * MONITOR/MWAIT with no hints, used for default C1 state. This invokes MWAIT
 * with interrupts enabled and no flags, which is backwards compatible with the
 * original MWAIT implementation.
 */
static void mwait_idle(void)
{
	if (!current_set_polling_and_test()) {
		trace_cpu_idle_rcuidle(1, smp_processor_id());
		if (this_cpu_has(X86_BUG_CLFLUSH_MONITOR)) {
			mb(); /* quirk */
			clflush((void *)&current_thread_info()->flags);
			mb(); /* quirk */
		}

		__monitor((void *)&current_thread_info()->flags, 0, 0);
		if (!need_resched())
			__sti_mwait(0, 0);
		else
			local_irq_enable();
		trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
	} else {
		local_irq_enable();
	}
	__current_clr_polling();
}

void select_idle_routine(const struct cpuinfo_x86 *c)
{
#ifdef CONFIG_SMP
	if (boot_option_idle_override == IDLE_POLL && smp_num_siblings > 1)
		pr_warn_once("WARNING: polling idle and HT enabled, performance may degrade\n");
#endif
	if (x86_idle || boot_option_idle_override == IDLE_POLL)
		return;

	if (cpu_has_bug(c, X86_BUG_AMD_APIC_C1E)) {
		/* E400: APIC timer interrupt does not wake up CPU from C1e */
		pr_info("using AMD E400 aware idle routine\n");
		x86_idle = amd_e400_idle;
	} else if (prefer_mwait_c1_over_halt(c)) {
		pr_info("using mwait in idle threads\n");
		x86_idle = mwait_idle;
	} else
		x86_idle = default_idle;
}

void __init init_amd_e400_c1e_mask(void)
{
	/* If we're using amd_e400_idle, we need to allocate amd_e400_c1e_mask. */
	if (x86_idle == amd_e400_idle)
		zalloc_cpumask_var(&amd_e400_c1e_mask, GFP_KERNEL);
}

static int __init idle_setup(char *str)
{
	if (!str)
		return -EINVAL;

	if (!strcmp(str, "poll")) {
		pr_info("using polling idle threads\n");
		boot_option_idle_override = IDLE_POLL;
		cpu_idle_poll_ctrl(true);
	} else if (!strcmp(str, "halt")) {
		/*
		 * When the boot option of idle=halt is added, halt is
		 * forced to be used for CPU idle. In such case CPU C2/C3
		 * won't be used again.
		 * To continue to load the CPU idle driver, don't touch
		 * the boot_option_idle_override.
		 */
		x86_idle = default_idle;
		boot_option_idle_override = IDLE_HALT;
	} else if (!strcmp(str, "nomwait")) {
		/*
		 * If the boot option of "idle=nomwait" is added,
		 * it means that mwait will be disabled for CPU C2/C3
		 * states. In such case it won't touch the variable
		 * of boot_option_idle_override.
		 */
		boot_option_idle_override = IDLE_NOMWAIT;
	} else
		return -1;

	return 0;
}
early_param("idle", idle_setup);
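/*
 * For reference, the "idle=" values handled above (see also
 * Documentation/kernel-parameters.txt):
 *
 *	idle=poll	poll in the idle loop instead of halting
 *	idle=halt	force HLT as the idle routine
 *	idle=nomwait	do not use MWAIT to enter idle states
 */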
unsigned long arch_align_stack(unsigned long sp)
{
	if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
		sp -= get_random_int() % 8192;
	return sp & ~0xf;
}

unsigned long arch_randomize_brk(struct mm_struct *mm)
{
	unsigned long range_end = mm->brk + 0x02000000;
	return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
}

/*
 * Called from fs/proc with a reference on @p to find the function
 * which called into schedule(). This needs to be done carefully
 * because the task might wake up and we might look at a stack
 * changing under us.
 */
unsigned long get_wchan(struct task_struct *p)
{
	unsigned long start, bottom, top, sp, fp, ip;
	int count = 0;

	if (!p || p == current || p->state == TASK_RUNNING)
		return 0;

	start = (unsigned long)task_stack_page(p);
	if (!start)
		return 0;

	/*
	 * Layout of the stack page:
	 *
	 * ----------- topmax = start + THREAD_SIZE - sizeof(unsigned long)
	 * PADDING
	 * ----------- top = topmax - TOP_OF_KERNEL_STACK_PADDING
	 * stack
	 * ----------- bottom = start + sizeof(thread_info)
	 * thread_info
	 * ----------- start
	 *
	 * The task's stack pointer points at the location where the
	 * framepointer is stored. The data on the stack is:
	 * ... IP FP ... IP FP
	 *
	 * We need to read FP and IP, so we need to adjust the upper
	 * bound by another unsigned long.
	 */
	top = start + THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING;
	top -= 2 * sizeof(unsigned long);
	bottom = start + sizeof(struct thread_info);

	sp = READ_ONCE(p->thread.sp);
	if (sp < bottom || sp > top)
		return 0;

	fp = READ_ONCE_NOCHECK(*(unsigned long *)sp);
	do {
		if (fp < bottom || fp > top)
			return 0;
		ip = READ_ONCE_NOCHECK(*(unsigned long *)(fp + sizeof(unsigned long)));
		if (!in_sched_functions(ip))
			return ip;
		fp = READ_ONCE_NOCHECK(*(unsigned long *)fp);
	} while (count++ < 16 && p->state != TASK_RUNNING);
	return 0;
}
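/*
 * The address returned by get_wchan() is resolved to a symbol name by
 * fs/proc and shows up in /proc/<pid>/wchan and in the WCHAN column of
 * ps(1).
 */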