#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/prctl.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/module.h>
#include <linux/pm.h>
#include <linux/tick.h>
#include <linux/random.h>
#include <linux/user-return-notifier.h>
#include <linux/dmi.h>
#include <linux/utsname.h>
#include <linux/stackprotector.h>
#include <linux/tick.h>
#include <linux/cpuidle.h>
#include <trace/events/power.h>
#include <linux/hw_breakpoint.h>
#include <asm/cpu.h>
#include <asm/apic.h>
#include <asm/syscalls.h>
#include <asm/idle.h>
#include <asm/uaccess.h>
#include <asm/mwait.h>
#include <asm/fpu/internal.h>
#include <asm/debugreg.h>
#include <asm/nmi.h>
#include <asm/tlbflush.h>
#include <asm/mce.h>
#include <asm/vm86.h>

/*
 * per-CPU TSS segments. Threads are completely 'soft' on Linux,
 * no more per-task TSS's. The TSS size is kept cacheline-aligned
 * so they are allowed to end up in the .data..cacheline_aligned
 * section. Since TSS's are completely CPU-local, we want them
 * on exact cacheline boundaries, to eliminate cacheline ping-pong.
 */
__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
        .x86_tss = {
                .sp0 = TOP_OF_INIT_STACK,
#ifdef CONFIG_X86_32
                .ss0 = __KERNEL_DS,
                .ss1 = __KERNEL_CS,
                .io_bitmap_base = INVALID_IO_BITMAP_OFFSET,
#endif
        },
#ifdef CONFIG_X86_32
        /*
         * Note that the .io_bitmap member must be extra-big. This is because
         * the CPU will access an additional byte beyond the end of the IO
         * permission bitmap. The extra byte must be all 1 bits, and must
         * be within the limit.
         */
        .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 },
#endif
};
EXPORT_PER_CPU_SYMBOL(cpu_tss);

#ifdef CONFIG_X86_64
static DEFINE_PER_CPU(unsigned char, is_idle);
static ATOMIC_NOTIFIER_HEAD(idle_notifier);

void idle_notifier_register(struct notifier_block *n)
{
        atomic_notifier_chain_register(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_register);

void idle_notifier_unregister(struct notifier_block *n)
{
        atomic_notifier_chain_unregister(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_unregister);
#endif

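/*
 * Illustrative sketch (not part of this file): on x86_64 a consumer of the
 * idle notifier chain above supplies a notifier callback and registers it;
 * the callback is invoked with IDLE_START/IDLE_END as enter_idle() and
 * __exit_idle() below run the chain. The names my_idle_notify and
 * my_idle_nb are made up for the example.
 *
 *      static int my_idle_notify(struct notifier_block *nb,
 *                                unsigned long action, void *unused)
 *      {
 *              if (action == IDLE_START)
 *                      pr_debug("CPU entering idle\n");
 *              else if (action == IDLE_END)
 *                      pr_debug("CPU leaving idle\n");
 *              return NOTIFY_OK;
 *      }
 *
 *      static struct notifier_block my_idle_nb = {
 *              .notifier_call = my_idle_notify,
 *      };
 *
 *      idle_notifier_register(&my_idle_nb);
 */
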
/*
 * this gets called so that we can store lazy state into memory and copy the
 * current task into the new thread.
 */
int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
{
        memcpy(dst, src, arch_task_struct_size);
#ifdef CONFIG_VM86
        dst->thread.vm86 = NULL;
#endif

        return fpu__copy(&dst->thread.fpu, &src->thread.fpu);
}

/*
 * Free current thread data structures etc..
 */
void exit_thread(void)
{
        struct task_struct *me = current;
        struct thread_struct *t = &me->thread;
        unsigned long *bp = t->io_bitmap_ptr;
        struct fpu *fpu = &t->fpu;

        if (bp) {
                struct tss_struct *tss = &per_cpu(cpu_tss, get_cpu());

                t->io_bitmap_ptr = NULL;
                clear_thread_flag(TIF_IO_BITMAP);
                /*
                 * Careful, clear this in the TSS too:
                 */
                memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
                t->io_bitmap_max = 0;
                put_cpu();
                kfree(bp);
        }

        free_vm86(t);

        fpu__drop(fpu);
}

void flush_thread(void)
{
        struct task_struct *tsk = current;

        flush_ptrace_hw_breakpoint(tsk);
        memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));

        fpu__clear(&tsk->thread.fpu);
}

static void hard_disable_TSC(void)
{
        cr4_set_bits(X86_CR4_TSD);
}

void disable_TSC(void)
{
        preempt_disable();
        if (!test_and_set_thread_flag(TIF_NOTSC))
                /*
                 * Must flip the CPU state synchronously with
                 * TIF_NOTSC in the current running context.
                 */
                hard_disable_TSC();
        preempt_enable();
}

static void hard_enable_TSC(void)
{
        cr4_clear_bits(X86_CR4_TSD);
}

static void enable_TSC(void)
{
        preempt_disable();
        if (test_and_clear_thread_flag(TIF_NOTSC))
                /*
                 * Must flip the CPU state synchronously with
                 * TIF_NOTSC in the current running context.
                 */
                hard_enable_TSC();
        preempt_enable();
}

int get_tsc_mode(unsigned long adr)
{
        unsigned int val;

        if (test_thread_flag(TIF_NOTSC))
                val = PR_TSC_SIGSEGV;
        else
                val = PR_TSC_ENABLE;

        return put_user(val, (unsigned int __user *)adr);
}

int set_tsc_mode(unsigned int val)
{
        if (val == PR_TSC_SIGSEGV)
                disable_TSC();
        else if (val == PR_TSC_ENABLE)
                enable_TSC();
        else
                return -EINVAL;

        return 0;
}

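/*
 * get_tsc_mode()/set_tsc_mode() above are the arch backends for the
 * PR_GET_TSC/PR_SET_TSC prctl()s. A rough userspace sketch (illustrative
 * only, not part of this file):
 *
 *      int mode;
 *
 *      prctl(PR_GET_TSC, &mode);               mode reads back as
 *                                              PR_TSC_ENABLE or PR_TSC_SIGSEGV
 *      prctl(PR_SET_TSC, PR_TSC_SIGSEGV);      later RDTSC in this task traps
 *                                              (CR4.TSD set) and raises SIGSEGV
 */
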
void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
                      struct tss_struct *tss)
{
        struct thread_struct *prev, *next;

        prev = &prev_p->thread;
        next = &next_p->thread;

        if (test_tsk_thread_flag(prev_p, TIF_BLOCKSTEP) ^
            test_tsk_thread_flag(next_p, TIF_BLOCKSTEP)) {
                unsigned long debugctl = get_debugctlmsr();

                debugctl &= ~DEBUGCTLMSR_BTF;
                if (test_tsk_thread_flag(next_p, TIF_BLOCKSTEP))
                        debugctl |= DEBUGCTLMSR_BTF;

                update_debugctlmsr(debugctl);
        }

        if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
            test_tsk_thread_flag(next_p, TIF_NOTSC)) {
                /* prev and next are different */
                if (test_tsk_thread_flag(next_p, TIF_NOTSC))
                        hard_disable_TSC();
                else
                        hard_enable_TSC();
        }

        if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
                /*
                 * Copy the relevant range of the IO bitmap.
                 * Normally this is 128 bytes or less:
                 */
                memcpy(tss->io_bitmap, next->io_bitmap_ptr,
                       max(prev->io_bitmap_max, next->io_bitmap_max));
        } else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
                /*
                 * Clear any possible leftover bits:
                 */
                memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
        }
        propagate_user_return_notify(prev_p, next_p);
}

/*
 * Idle related variables and functions
 */
unsigned long boot_option_idle_override = IDLE_NO_OVERRIDE;
EXPORT_SYMBOL(boot_option_idle_override);

static void (*x86_idle)(void);

#ifndef CONFIG_SMP
static inline void play_dead(void)
{
        BUG();
}
#endif

#ifdef CONFIG_X86_64
void enter_idle(void)
{
        this_cpu_write(is_idle, 1);
        atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}

static void __exit_idle(void)
{
        if (x86_test_and_clear_bit_percpu(0, is_idle) == 0)
                return;
        atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
}

/* Called from interrupts to signify idle end */
void exit_idle(void)
{
        /* idle loop has pid 0 */
        if (current->pid)
                return;
        __exit_idle();
}
#endif

void arch_cpu_idle_enter(void)
{
        local_touch_nmi();
        enter_idle();
}

void arch_cpu_idle_exit(void)
{
        __exit_idle();
}

void arch_cpu_idle_dead(void)
{
        play_dead();
}

/*
 * Called from the generic idle code.
 */
void arch_cpu_idle(void)
{
        x86_idle();
}

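/*
 * x86_idle is filled in by select_idle_routine() further down (default_idle,
 * mwait_idle or amd_e400_idle) and may be forced to default_idle by the
 * "idle=halt" boot parameter handled at the end of this file.
 */
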
/*
 * We use this if we don't have any better idle routine..
 */
void default_idle(void)
{
        trace_cpu_idle_rcuidle(1, smp_processor_id());
        safe_halt();
        trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
}
#ifdef CONFIG_APM_MODULE
EXPORT_SYMBOL(default_idle);
#endif

#ifdef CONFIG_XEN
bool xen_set_default_idle(void)
{
        bool ret = !!x86_idle;

        x86_idle = default_idle;

        return ret;
}
#endif

void stop_this_cpu(void *dummy)
{
        local_irq_disable();
        /*
         * Remove this CPU:
         */
        set_cpu_online(smp_processor_id(), false);
        disable_local_APIC();
        mcheck_cpu_clear(this_cpu_ptr(&cpu_info));

        for (;;)
                halt();
}

bool amd_e400_c1e_detected;
EXPORT_SYMBOL(amd_e400_c1e_detected);

static cpumask_var_t amd_e400_c1e_mask;

void amd_e400_remove_cpu(int cpu)
{
        if (amd_e400_c1e_mask != NULL)
                cpumask_clear_cpu(cpu, amd_e400_c1e_mask);
}

/*
 * AMD Erratum 400 aware idle routine. We check for C1E active in the interrupt
 * pending message MSR. If we detect C1E, then we handle it the same
 * way as C3 power states (local apic timer and TSC stop)
 */
static void amd_e400_idle(void)
{
        if (!amd_e400_c1e_detected) {
                u32 lo, hi;

                rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);

                if (lo & K8_INTP_C1E_ACTIVE_MASK) {
                        amd_e400_c1e_detected = true;
                        if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
                                mark_tsc_unstable("TSC halt in AMD C1E");
                        pr_info("System has AMD C1E enabled\n");
                }
        }

        if (amd_e400_c1e_detected) {
                int cpu = smp_processor_id();

                if (!cpumask_test_cpu(cpu, amd_e400_c1e_mask)) {
                        cpumask_set_cpu(cpu, amd_e400_c1e_mask);
                        /* Force broadcast so ACPI can not interfere. */
                        tick_broadcast_force();
                        pr_info("Switch to broadcast mode on CPU%d\n", cpu);
                }
                tick_broadcast_enter();

                default_idle();

                /*
                 * The switch back from broadcast mode needs to be
                 * called with interrupts disabled.
                 */
                local_irq_disable();
                tick_broadcast_exit();
                local_irq_enable();
        } else
                default_idle();
}

/*
 * Intel Core2 and older machines prefer MWAIT over HALT for C1.
 * We can't rely on cpuidle installing MWAIT, because it will not load
 * on systems that support only C1 -- so the boot default must be MWAIT.
 *
 * Some AMD machines are the opposite, they depend on using HALT.
 *
 * So for default C1, which is used during boot until cpuidle loads,
 * use MWAIT-C1 on Intel HW that has it, else use HALT.
 */
static int prefer_mwait_c1_over_halt(const struct cpuinfo_x86 *c)
{
        if (c->x86_vendor != X86_VENDOR_INTEL)
                return 0;

        if (!cpu_has(c, X86_FEATURE_MWAIT))
                return 0;

        return 1;
}

/*
 * MONITOR/MWAIT with no hints, used for default C1 state. This invokes MWAIT
 * with interrupts enabled and no flags, which is backwards compatible with the
 * original MWAIT implementation.
 */
static void mwait_idle(void)
{
        if (!current_set_polling_and_test()) {
                trace_cpu_idle_rcuidle(1, smp_processor_id());
                if (this_cpu_has(X86_BUG_CLFLUSH_MONITOR)) {
                        smp_mb(); /* quirk */
                        clflush((void *)&current_thread_info()->flags);
                        smp_mb(); /* quirk */
                }

                __monitor((void *)&current_thread_info()->flags, 0, 0);
                if (!need_resched())
                        __sti_mwait(0, 0);
                else
                        local_irq_enable();
                trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
        } else {
                local_irq_enable();
        }
        __current_clr_polling();
}

void select_idle_routine(const struct cpuinfo_x86 *c)
{
#ifdef CONFIG_SMP
        if (boot_option_idle_override == IDLE_POLL && smp_num_siblings > 1)
                pr_warn_once("WARNING: polling idle and HT enabled, performance may degrade\n");
#endif
        if (x86_idle || boot_option_idle_override == IDLE_POLL)
                return;

        if (cpu_has_bug(c, X86_BUG_AMD_APIC_C1E)) {
                /* E400: APIC timer interrupt does not wake up CPU from C1e */
                pr_info("using AMD E400 aware idle routine\n");
                x86_idle = amd_e400_idle;
        } else if (prefer_mwait_c1_over_halt(c)) {
                pr_info("using mwait in idle threads\n");
                x86_idle = mwait_idle;
        } else
                x86_idle = default_idle;
}

void __init init_amd_e400_c1e_mask(void)
{
        /* If we're using amd_e400_idle, we need to allocate amd_e400_c1e_mask. */
        if (x86_idle == amd_e400_idle)
                zalloc_cpumask_var(&amd_e400_c1e_mask, GFP_KERNEL);
}

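/*
 * Handler for the "idle=" boot parameter. Recognized values, matching the
 * branches below: "poll", "halt" and "nomwait".
 */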
static int __init idle_setup(char *str)
{
        if (!str)
                return -EINVAL;

        if (!strcmp(str, "poll")) {
                pr_info("using polling idle threads\n");
                boot_option_idle_override = IDLE_POLL;
                cpu_idle_poll_ctrl(true);
        } else if (!strcmp(str, "halt")) {
                /*
                 * When the boot option of idle=halt is added, halt is
                 * forced to be used for CPU idle. In such case CPU C2/C3
                 * won't be used again.
                 * To continue to load the CPU idle driver, don't touch
                 * the boot_option_idle_override.
                 */
                x86_idle = default_idle;
                boot_option_idle_override = IDLE_HALT;
        } else if (!strcmp(str, "nomwait")) {
                /*
                 * If the boot option of "idle=nomwait" is added,
                 * it means that mwait will be disabled for CPU C2/C3
                 * states. In such case it won't touch the variable
                 * of boot_option_idle_override.
                 */
                boot_option_idle_override = IDLE_NOMWAIT;
        } else
                return -1;

        return 0;
}
early_param("idle", idle_setup);

unsigned long arch_align_stack(unsigned long sp)
{
        if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
                sp -= get_random_int() % 8192;
        return sp & ~0xf;
}

unsigned long arch_randomize_brk(struct mm_struct *mm)
{
        unsigned long range_end = mm->brk + 0x02000000;
        return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
}

/*
 * Called from fs/proc with a reference on @p to find the function
 * which called into schedule(). This needs to be done carefully
 * because the task might wake up and we might look at a stack
 * changing under us.
 */
unsigned long get_wchan(struct task_struct *p)
{
        unsigned long start, bottom, top, sp, fp, ip;
        int count = 0;

        if (!p || p == current || p->state == TASK_RUNNING)
                return 0;

        start = (unsigned long)task_stack_page(p);
        if (!start)
                return 0;

        /*
         * Layout of the stack page:
         *
         * ----------- topmax = start + THREAD_SIZE - sizeof(unsigned long)
         * PADDING
         * ----------- top = topmax - TOP_OF_KERNEL_STACK_PADDING
         * stack
         * ----------- bottom = start + sizeof(thread_info)
         * thread_info
         * ----------- start
         *
         * The task's stack pointer points at the location where the
         * framepointer is stored. The data on the stack is:
         * ... IP FP ... IP FP
         *
         * We need to read FP and IP, so we need to adjust the upper
         * bound by another unsigned long.
         */
        top = start + THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING;
        top -= 2 * sizeof(unsigned long);
        bottom = start + sizeof(struct thread_info);

        sp = READ_ONCE(p->thread.sp);
        if (sp < bottom || sp > top)
                return 0;

        fp = READ_ONCE_NOCHECK(*(unsigned long *)sp);
        do {
                if (fp < bottom || fp > top)
                        return 0;
                ip = READ_ONCE_NOCHECK(*(unsigned long *)(fp + sizeof(unsigned long)));
                if (!in_sched_functions(ip))
                        return ip;
                fp = READ_ONCE_NOCHECK(*(unsigned long *)fp);
        } while (count++ < 16 && p->state != TASK_RUNNING);
        return 0;
}