xref: /linux/arch/x86/kernel/process.c (revision cf2f33a4e54096f90652cca3511fd6a456ea5abe)
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/prctl.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/module.h>
#include <linux/pm.h>
#include <linux/tick.h>
#include <linux/random.h>
#include <linux/user-return-notifier.h>
#include <linux/dmi.h>
#include <linux/utsname.h>
#include <linux/stackprotector.h>
#include <linux/cpuidle.h>
#include <trace/events/power.h>
#include <linux/hw_breakpoint.h>
#include <asm/cpu.h>
#include <asm/apic.h>
#include <asm/syscalls.h>
#include <asm/idle.h>
#include <asm/uaccess.h>
#include <asm/mwait.h>
#include <asm/fpu/internal.h>
#include <asm/debugreg.h>
#include <asm/nmi.h>
#include <asm/tlbflush.h>
#include <asm/mce.h>

/*
 * per-CPU TSS segments. Threads are completely 'soft' on Linux,
 * no more per-task TSS's. The TSS size is kept cacheline-aligned
 * so they are allowed to end up in the .data..cacheline_aligned
 * section. Since TSS's are completely CPU-local, we want them
 * on exact cacheline boundaries, to eliminate cacheline ping-pong.
 */
__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
	.x86_tss = {
		.sp0 = TOP_OF_INIT_STACK,
#ifdef CONFIG_X86_32
		.ss0 = __KERNEL_DS,
		.ss1 = __KERNEL_CS,
		.io_bitmap_base	= INVALID_IO_BITMAP_OFFSET,
#endif
	 },
#ifdef CONFIG_X86_32
	 /*
	  * Note that the .io_bitmap member must be extra-big. This is because
	  * the CPU will access an additional byte beyond the end of the IO
	  * permission bitmap. The extra byte must be all 1 bits, and must
	  * be within the limit.
	  */
	.io_bitmap		= { [0 ... IO_BITMAP_LONGS] = ~0 },
#endif
};
EXPORT_PER_CPU_SYMBOL(cpu_tss);

#ifdef CONFIG_X86_64
static DEFINE_PER_CPU(unsigned char, is_idle);
static ATOMIC_NOTIFIER_HEAD(idle_notifier);

void idle_notifier_register(struct notifier_block *n)
{
	atomic_notifier_chain_register(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_register);

void idle_notifier_unregister(struct notifier_block *n)
{
	atomic_notifier_chain_unregister(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_unregister);
#endif
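
/*
 * A minimal usage sketch (illustrative only, with hypothetical names): a
 * driver that wants to know when a CPU enters or leaves the idle loop can
 * hang a notifier_block on this chain. The callbacks are invoked from the
 * idle path and from interrupts via exit_idle(), so they must not sleep.
 *
 *	static int my_idle_notify(struct notifier_block *nb,
 *				  unsigned long action, void *unused)
 *	{
 *		if (action == IDLE_START)
 *			pr_debug("CPU%d entering idle\n", smp_processor_id());
 *		else if (action == IDLE_END)
 *			pr_debug("CPU%d leaving idle\n", smp_processor_id());
 *		return NOTIFY_OK;
 *	}
 *
 *	static struct notifier_block my_idle_nb = {
 *		.notifier_call	= my_idle_notify,
 *	};
 *
 *	idle_notifier_register(&my_idle_nb);
 *	...
 *	idle_notifier_unregister(&my_idle_nb);
 */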

/*
 * This gets called so that we can store lazy state into memory and copy the
 * current task into the new thread.
 */
int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
{
	memcpy(dst, src, arch_task_struct_size);

	return fpu__copy(&dst->thread.fpu, &src->thread.fpu);
}

/*
 * Free current thread data structures etc.
 */
void exit_thread(void)
{
	struct task_struct *me = current;
	struct thread_struct *t = &me->thread;
	unsigned long *bp = t->io_bitmap_ptr;
	struct fpu *fpu = &t->fpu;

	if (bp) {
		struct tss_struct *tss = &per_cpu(cpu_tss, get_cpu());

		t->io_bitmap_ptr = NULL;
		clear_thread_flag(TIF_IO_BITMAP);
		/*
		 * Careful, clear this in the TSS too:
		 */
		memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
		t->io_bitmap_max = 0;
		put_cpu();
		kfree(bp);
	}

	fpu__drop(fpu);
}

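/*
 * Reset the per-thread state that must not survive an exec(): hardware
 * breakpoints, the TLS descriptors and the FPU state.
 */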
void flush_thread(void)
{
	struct task_struct *tsk = current;

	flush_ptrace_hw_breakpoint(tsk);
	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));

	fpu__clear(&tsk->thread.fpu);
}

static void hard_disable_TSC(void)
{
	cr4_set_bits(X86_CR4_TSD);
}

void disable_TSC(void)
{
	preempt_disable();
	if (!test_and_set_thread_flag(TIF_NOTSC))
		/*
		 * Must flip the CPU state synchronously with
		 * TIF_NOTSC in the current running context.
		 */
		hard_disable_TSC();
	preempt_enable();
}

static void hard_enable_TSC(void)
{
	cr4_clear_bits(X86_CR4_TSD);
}

static void enable_TSC(void)
{
	preempt_disable();
	if (test_and_clear_thread_flag(TIF_NOTSC))
		/*
		 * Must flip the CPU state synchronously with
		 * TIF_NOTSC in the current running context.
		 */
		hard_enable_TSC();
	preempt_enable();
}

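/*
 * get_tsc_mode()/set_tsc_mode() back the PR_GET_TSC/PR_SET_TSC prctl()s:
 * PR_TSC_SIGSEGV sets CR4.TSD so that a user-space RDTSC traps and the task
 * gets SIGSEGV, PR_TSC_ENABLE clears it again. Rough user-space sketch
 * (illustrative only):
 *
 *	#include <sys/prctl.h>
 *	#include <linux/prctl.h>
 *
 *	int mode;
 *	prctl(PR_GET_TSC, &mode);		// query the current mode
 *	prctl(PR_SET_TSC, PR_TSC_SIGSEGV);	// RDTSC now raises SIGSEGV
 */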
int get_tsc_mode(unsigned long adr)
{
	unsigned int val;

	if (test_thread_flag(TIF_NOTSC))
		val = PR_TSC_SIGSEGV;
	else
		val = PR_TSC_ENABLE;

	return put_user(val, (unsigned int __user *)adr);
}

int set_tsc_mode(unsigned int val)
{
	if (val == PR_TSC_SIGSEGV)
		disable_TSC();
	else if (val == PR_TSC_ENABLE)
		enable_TSC();
	else
		return -EINVAL;

	return 0;
}

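/*
 * Slow path of a context switch: propagate the BTF debugctl bit, the
 * CR4.TSD (TSC disable) setting, the I/O permission bitmap and any
 * user-return notifiers from prev to next.
 */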
void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
		      struct tss_struct *tss)
{
	struct thread_struct *prev, *next;

	prev = &prev_p->thread;
	next = &next_p->thread;

	if (test_tsk_thread_flag(prev_p, TIF_BLOCKSTEP) ^
	    test_tsk_thread_flag(next_p, TIF_BLOCKSTEP)) {
		unsigned long debugctl = get_debugctlmsr();

		debugctl &= ~DEBUGCTLMSR_BTF;
		if (test_tsk_thread_flag(next_p, TIF_BLOCKSTEP))
			debugctl |= DEBUGCTLMSR_BTF;

		update_debugctlmsr(debugctl);
	}

	if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
	    test_tsk_thread_flag(next_p, TIF_NOTSC)) {
		/* prev and next are different */
		if (test_tsk_thread_flag(next_p, TIF_NOTSC))
			hard_disable_TSC();
		else
			hard_enable_TSC();
	}

	if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
		/*
		 * Copy the relevant range of the IO bitmap.
		 * Normally this is 128 bytes or less:
		 */
		memcpy(tss->io_bitmap, next->io_bitmap_ptr,
		       max(prev->io_bitmap_max, next->io_bitmap_max));
	} else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
		/*
		 * Clear any possible leftover bits:
		 */
		memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
	}
	propagate_user_return_notify(prev_p, next_p);
}

/*
 * Idle related variables and functions
 */
unsigned long boot_option_idle_override = IDLE_NO_OVERRIDE;
EXPORT_SYMBOL(boot_option_idle_override);

static void (*x86_idle)(void);

#ifndef CONFIG_SMP
static inline void play_dead(void)
{
	BUG();
}
#endif

#ifdef CONFIG_X86_64
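/*
 * Mark this CPU as idle and run the IDLE_START notifier chain. Interrupts
 * that end the idle period call exit_idle() below to signal IDLE_END.
 */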
void enter_idle(void)
{
	this_cpu_write(is_idle, 1);
	atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}

static void __exit_idle(void)
{
	if (x86_test_and_clear_bit_percpu(0, is_idle) == 0)
		return;
	atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
}

/* Called from interrupts to signify idle end */
void exit_idle(void)
{
	/* idle loop has pid 0 */
	if (current->pid)
		return;
	__exit_idle();
}
#endif

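/*
 * arch_cpu_idle_enter/exit/dead() and arch_cpu_idle() are the x86 hooks
 * invoked from the generic idle loop in kernel/sched/idle.c.
 */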
void arch_cpu_idle_enter(void)
{
	local_touch_nmi();
	enter_idle();
}

void arch_cpu_idle_exit(void)
{
	__exit_idle();
}

void arch_cpu_idle_dead(void)
{
	play_dead();
}

/*
 * Called from the generic idle code.
 */
void arch_cpu_idle(void)
{
	x86_idle();
}

/*
 * We use this if we don't have any better idle routine.
 */
void default_idle(void)
{
	trace_cpu_idle_rcuidle(1, smp_processor_id());
	safe_halt();
	trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
}
#ifdef CONFIG_APM_MODULE
EXPORT_SYMBOL(default_idle);
#endif

#ifdef CONFIG_XEN
bool xen_set_default_idle(void)
{
	bool ret = !!x86_idle;

	x86_idle = default_idle;

	return ret;
}
#endif
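
/*
 * Park this CPU for good: take it offline, disable its local APIC, clear
 * its machine-check setup and then halt() forever with interrupts off.
 * Used by the reboot/shutdown paths to stop the other CPUs.
 */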
void stop_this_cpu(void *dummy)
{
	local_irq_disable();
	/*
	 * Remove this CPU:
	 */
	set_cpu_online(smp_processor_id(), false);
	disable_local_APIC();
	mcheck_cpu_clear(this_cpu_ptr(&cpu_info));

	for (;;)
		halt();
}

bool amd_e400_c1e_detected;
EXPORT_SYMBOL(amd_e400_c1e_detected);

static cpumask_var_t amd_e400_c1e_mask;

void amd_e400_remove_cpu(int cpu)
{
	if (amd_e400_c1e_mask != NULL)
		cpumask_clear_cpu(cpu, amd_e400_c1e_mask);
}

/*
 * AMD Erratum 400 aware idle routine. We check for C1E active in the interrupt
 * pending message MSR. If we detect C1E, then we handle it the same
 * way as C3 power states (local APIC timer and TSC stop).
 */
static void amd_e400_idle(void)
{
	if (!amd_e400_c1e_detected) {
		u32 lo, hi;

		rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);

		if (lo & K8_INTP_C1E_ACTIVE_MASK) {
			amd_e400_c1e_detected = true;
			if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
				mark_tsc_unstable("TSC halt in AMD C1E");
			pr_info("System has AMD C1E enabled\n");
		}
	}

	if (amd_e400_c1e_detected) {
		int cpu = smp_processor_id();

		if (!cpumask_test_cpu(cpu, amd_e400_c1e_mask)) {
			cpumask_set_cpu(cpu, amd_e400_c1e_mask);
			/* Force broadcast so ACPI can not interfere. */
			tick_broadcast_force();
			pr_info("Switch to broadcast mode on CPU%d\n", cpu);
		}
		tick_broadcast_enter();

		default_idle();

		/*
		 * The switch back from broadcast mode needs to be
		 * called with interrupts disabled.
		 */
		local_irq_disable();
		tick_broadcast_exit();
		local_irq_enable();
	} else
		default_idle();
}

/*
 * Intel Core2 and older machines prefer MWAIT over HALT for C1.
 * We can't rely on cpuidle installing MWAIT, because it will not load
 * on systems that support only C1 -- so the boot default must be MWAIT.
 *
 * Some AMD machines are the opposite, they depend on using HALT.
 *
 * So for default C1, which is used during boot until cpuidle loads,
 * use MWAIT-C1 on Intel HW that has it, else use HALT.
 */
static int prefer_mwait_c1_over_halt(const struct cpuinfo_x86 *c)
{
	if (c->x86_vendor != X86_VENDOR_INTEL)
		return 0;

	if (!cpu_has(c, X86_FEATURE_MWAIT))
		return 0;

	return 1;
}

/*
 * MONITOR/MWAIT with no hints, used for default C1 state. This invokes MWAIT
 * with interrupts enabled and no flags, which is backwards compatible with the
 * original MWAIT implementation.
 */
static void mwait_idle(void)
{
	if (!current_set_polling_and_test()) {
		trace_cpu_idle_rcuidle(1, smp_processor_id());
		if (this_cpu_has(X86_BUG_CLFLUSH_MONITOR)) {
			smp_mb(); /* quirk */
			clflush((void *)&current_thread_info()->flags);
			smp_mb(); /* quirk */
		}

		__monitor((void *)&current_thread_info()->flags, 0, 0);
		if (!need_resched())
			__sti_mwait(0, 0);
		else
			local_irq_enable();
		trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
	} else {
		local_irq_enable();
	}
	__current_clr_polling();
}

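/*
 * Pick the boot-time default idle routine for this CPU: the AMD E400
 * workaround, MWAIT-C1 on Intel, or plain HLT. cpuidle may install a
 * better routine later; "idle=poll" bypasses this selection entirely.
 */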
void select_idle_routine(const struct cpuinfo_x86 *c)
{
#ifdef CONFIG_SMP
	if (boot_option_idle_override == IDLE_POLL && smp_num_siblings > 1)
		pr_warn_once("WARNING: polling idle and HT enabled, performance may degrade\n");
#endif
	if (x86_idle || boot_option_idle_override == IDLE_POLL)
		return;

	if (cpu_has_bug(c, X86_BUG_AMD_APIC_C1E)) {
		/* E400: APIC timer interrupt does not wake up CPU from C1e */
		pr_info("using AMD E400 aware idle routine\n");
		x86_idle = amd_e400_idle;
	} else if (prefer_mwait_c1_over_halt(c)) {
		pr_info("using mwait in idle threads\n");
		x86_idle = mwait_idle;
	} else
		x86_idle = default_idle;
}

void __init init_amd_e400_c1e_mask(void)
{
	/* If we're using amd_e400_idle, we need to allocate amd_e400_c1e_mask. */
	if (x86_idle == amd_e400_idle)
		zalloc_cpumask_var(&amd_e400_c1e_mask, GFP_KERNEL);
}

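/*
 * Parse the "idle=" kernel command line option:
 *   idle=poll    - busy-poll instead of entering any idle state
 *   idle=halt    - always use HLT for the idle loop
 *   idle=nomwait - keep the default routine but disallow MWAIT for C-states
 */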
static int __init idle_setup(char *str)
{
	if (!str)
		return -EINVAL;

	if (!strcmp(str, "poll")) {
		pr_info("using polling idle threads\n");
		boot_option_idle_override = IDLE_POLL;
		cpu_idle_poll_ctrl(true);
	} else if (!strcmp(str, "halt")) {
		/*
		 * When the boot option idle=halt is given, halt is forced
		 * to be used for CPU idle. In that case the C2/C3 states
		 * won't be used again.
		 * To continue to load the CPU idle driver, don't touch
		 * boot_option_idle_override.
		 */
		x86_idle = default_idle;
		boot_option_idle_override = IDLE_HALT;
	} else if (!strcmp(str, "nomwait")) {
		/*
		 * If the boot option "idle=nomwait" is given, mwait is
		 * disabled for the CPU C2/C3 states. In that case the
		 * boot_option_idle_override variable isn't touched.
		 */
		boot_option_idle_override = IDLE_NOMWAIT;
	} else
		return -1;

	return 0;
}
early_param("idle", idle_setup);

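/*
 * Randomize the top of the user stack: offset it downwards by up to 8 kB
 * (when address-space randomization is enabled) and 16-byte align it.
 */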
unsigned long arch_align_stack(unsigned long sp)
{
	if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
		sp -= get_random_int() % 8192;
	return sp & ~0xf;
}

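/*
 * Place the heap break at a random address within 32 MB above the current
 * brk; fall back to the unrandomized brk if randomize_range() fails.
 */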
unsigned long arch_randomize_brk(struct mm_struct *mm)
{
	unsigned long range_end = mm->brk + 0x02000000;
	return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
}