xref: /linux/arch/x86/kernel/process_64.c (revision db4e83957f961f9053282409c5062c6baef857a4)
/*
 *  Copyright (C) 1995  Linus Torvalds
 *
 *  Pentium III FXSR, SSE support
 *	Gareth Hughes <gareth@valinux.com>, May 2000
 *
 *  X86-64 port
 *	Andi Kleen.
 *
 *	CPU hotplug support - ashok.raj@intel.com
 */

/*
 * This file handles the architecture-dependent parts of process handling.
 */

#include <linux/stackprotector.h>
#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/interrupt.h>
#include <linux/delay.h>
#include <linux/module.h>
#include <linux/ptrace.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/tick.h>
#include <linux/prctl.h>
#include <linux/uaccess.h>
#include <linux/io.h>
#include <linux/ftrace.h>
#include <linux/cpuidle.h>

#include <asm/pgtable.h>
#include <asm/system.h>
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/mmu_context.h>
#include <asm/prctl.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/ia32.h>
#include <asm/idle.h>
#include <asm/syscalls.h>
#include <asm/debugreg.h>
#include <asm/nmi.h>

asmlinkage extern void ret_from_fork(void);

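/*
 * old_rsp caches the user-space stack pointer while this CPU runs in
 * the kernel (the 64-bit syscall entry path saves it here); is_idle
 * flags whether the CPU is currently in the idle loop, so that the
 * idle notifiers fire only once per idle period.
 */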
DEFINE_PER_CPU(unsigned long, old_rsp);
static DEFINE_PER_CPU(unsigned char, is_idle);

static ATOMIC_NOTIFIER_HEAD(idle_notifier);

void idle_notifier_register(struct notifier_block *n)
{
	atomic_notifier_chain_register(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_register);

void idle_notifier_unregister(struct notifier_block *n)
{
	atomic_notifier_chain_unregister(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_unregister);

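/* Mark this CPU as idle and run the IDLE_START notifier callbacks. */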
void enter_idle(void)
{
	percpu_write(is_idle, 1);
	atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}

static void __exit_idle(void)
{
	if (x86_test_and_clear_bit_percpu(0, is_idle) == 0)
		return;
	atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
}

/* Called from interrupts to signify idle end */
void exit_idle(void)
{
	/* idle loop has pid 0 */
	if (current->pid)
		return;
	__exit_idle();
}

#ifndef CONFIG_SMP
static inline void play_dead(void)
{
	BUG();
}
#endif

/*
 * The idle thread. There's no useful work to be
 * done, so just try to conserve power and have a
 * low exit latency (i.e. sit in a loop waiting for
 * somebody to say that they'd like to reschedule)
 */
void cpu_idle(void)
{
	current_thread_info()->status |= TS_POLLING;

	/*
	 * If we're the non-boot CPU, nothing set the stack canary up
	 * for us.  CPU0 already has it initialized but no harm in
	 * doing it again.  This is a good place for updating it, as
	 * we won't ever return from this function (so the invalid
	 * canaries already on the stack won't ever trigger).
	 */
	boot_init_stack_canary();

	/* endless idle loop with no priority at all */
	while (1) {
		tick_nohz_stop_sched_tick(1);
		while (!need_resched()) {

			rmb();

			if (cpu_is_offline(smp_processor_id()))
				play_dead();
			/*
			 * Idle routines should keep interrupts disabled
			 * from here on, until they go to idle.
			 * Otherwise, idle callbacks can misfire.
			 */
			local_touch_nmi();
			local_irq_disable();
			enter_idle();
			/* Don't trace irqs off for idle */
			stop_critical_timings();
			if (cpuidle_idle_call())
				pm_idle();
			start_critical_timings();

			/* In many cases the interrupt that ended idle
			   has already called exit_idle. But some idle
			   loops can be woken up without interrupt. */
			__exit_idle();
		}

		tick_nohz_restart_sched_tick();
		preempt_enable_no_resched();
		schedule();
		preempt_disable();
	}
}

/* Also prints some state that isn't saved in the pt_regs */
void __show_regs(struct pt_regs *regs, int all)
{
	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
	unsigned long d0, d1, d2, d3, d6, d7;
	unsigned int fsindex, gsindex;
	unsigned int ds, cs, es;

	show_regs_common();
	printk(KERN_DEFAULT "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
	printk_address(regs->ip, 1);
	printk(KERN_DEFAULT "RSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss,
			regs->sp, regs->flags);
	printk(KERN_DEFAULT "RAX: %016lx RBX: %016lx RCX: %016lx\n",
	       regs->ax, regs->bx, regs->cx);
	printk(KERN_DEFAULT "RDX: %016lx RSI: %016lx RDI: %016lx\n",
	       regs->dx, regs->si, regs->di);
	printk(KERN_DEFAULT "RBP: %016lx R08: %016lx R09: %016lx\n",
	       regs->bp, regs->r8, regs->r9);
	printk(KERN_DEFAULT "R10: %016lx R11: %016lx R12: %016lx\n",
	       regs->r10, regs->r11, regs->r12);
	printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n",
	       regs->r13, regs->r14, regs->r15);

	asm("movl %%ds,%0" : "=r" (ds));
	asm("movl %%cs,%0" : "=r" (cs));
	asm("movl %%es,%0" : "=r" (es));
	asm("movl %%fs,%0" : "=r" (fsindex));
	asm("movl %%gs,%0" : "=r" (gsindex));

	rdmsrl(MSR_FS_BASE, fs);
	rdmsrl(MSR_GS_BASE, gs);
	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

	if (!all)
		return;

	cr0 = read_cr0();
	cr2 = read_cr2();
	cr3 = read_cr3();
	cr4 = read_cr4();

	printk(KERN_DEFAULT "FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
	       fs, fsindex, gs, gsindex, shadowgs);
	printk(KERN_DEFAULT "CS:  %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
			es, cr0);
	printk(KERN_DEFAULT "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
			cr4);

	get_debugreg(d0, 0);
	get_debugreg(d1, 1);
	get_debugreg(d2, 2);
	printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
	get_debugreg(d3, 3);
	get_debugreg(d6, 6);
	get_debugreg(d7, 7);
	printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
}

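/*
 * Called when a dead task's thread resources are released: any LDT
 * should have been torn down together with the mm by now, so complain
 * loudly if one is still attached.
 */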
void release_thread(struct task_struct *dead_task)
{
	if (dead_task->mm) {
		if (dead_task->mm->context.size) {
			printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
					dead_task->comm,
					dead_task->mm->context.ldt,
					dead_task->mm->context.size);
			BUG();
		}
	}
}

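/*
 * Helpers for the GDT TLS slots used to hold small (32-bit) FS/GS
 * bases; see do_arch_prctl() below.
 */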
static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
{
	struct user_desc ud = {
		.base_addr = addr,
		.limit = 0xfffff,
		.seg_32bit = 1,
		.limit_in_pages = 1,
		.useable = 1,
	};
	struct desc_struct *desc = t->thread.tls_array;
	desc += tls;
	fill_ldt(desc, &ud);
}

static inline u32 read_32bit_tls(struct task_struct *t, int tls)
{
	return get_desc_base(&t->thread.tls_array[tls]);
}

/*
 * This gets called before we allocate a new thread and copy
 * the current task into it.
 */
void prepare_to_copy(struct task_struct *tsk)
{
	unlazy_fpu(tsk);
}

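/*
 * Set up the thread state of a newly forked task: copy the parent's
 * register frame, zero the child's return value, record the current
 * segment selectors, duplicate the I/O bitmap if one is in use and
 * handle CLONE_SETTLS.
 */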
int copy_thread(unsigned long clone_flags, unsigned long sp,
		unsigned long unused,
	struct task_struct *p, struct pt_regs *regs)
{
	int err;
	struct pt_regs *childregs;
	struct task_struct *me = current;

	childregs = ((struct pt_regs *)
			(THREAD_SIZE + task_stack_page(p))) - 1;
	*childregs = *regs;

	childregs->ax = 0;
	if (user_mode(regs))
		childregs->sp = sp;
	else
		childregs->sp = (unsigned long)childregs;

	p->thread.sp = (unsigned long) childregs;
	p->thread.sp0 = (unsigned long) (childregs+1);
	p->thread.usersp = me->thread.usersp;

	set_tsk_thread_flag(p, TIF_FORK);

	p->thread.io_bitmap_ptr = NULL;

	savesegment(gs, p->thread.gsindex);
	p->thread.gs = p->thread.gsindex ? 0 : me->thread.gs;
	savesegment(fs, p->thread.fsindex);
	p->thread.fs = p->thread.fsindex ? 0 : me->thread.fs;
	savesegment(es, p->thread.es);
	savesegment(ds, p->thread.ds);

	err = -ENOMEM;
	memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));

	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
		p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
		if (!p->thread.io_bitmap_ptr) {
			p->thread.io_bitmap_max = 0;
			return -ENOMEM;
		}
		memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
				IO_BITMAP_BYTES);
		set_tsk_thread_flag(p, TIF_IO_BITMAP);
	}

	/*
	 * Set a new TLS for the child thread?
	 */
	if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
		if (test_thread_flag(TIF_IA32))
			err = do_set_thread_area(p, -1,
				(struct user_desc __user *)childregs->si, 0);
		else
#endif
			err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
		if (err)
			goto out;
	}
	err = 0;
out:
	if (err && p->thread.io_bitmap_ptr) {
		kfree(p->thread.io_bitmap_ptr);
		p->thread.io_bitmap_max = 0;
	}

	return err;
}

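/*
 * Common code for starting a new user-space thread image: reset the
 * data segments, point the register frame at the new instruction and
 * stack pointers and drop the old extended FP state.
 */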
static void
start_thread_common(struct pt_regs *regs, unsigned long new_ip,
		    unsigned long new_sp,
		    unsigned int _cs, unsigned int _ss, unsigned int _ds)
{
	loadsegment(fs, 0);
	loadsegment(es, _ds);
	loadsegment(ds, _ds);
	load_gs_index(0);
	regs->ip		= new_ip;
	regs->sp		= new_sp;
	percpu_write(old_rsp, new_sp);
	regs->cs		= _cs;
	regs->ss		= _ss;
	regs->flags		= X86_EFLAGS_IF;
	/*
	 * Free the old FP and other extended state
	 */
	free_thread_xstate(current);
}

void
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
	start_thread_common(regs, new_ip, new_sp,
			    __USER_CS, __USER_DS, 0);
}

#ifdef CONFIG_IA32_EMULATION
void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp)
{
	start_thread_common(regs, new_ip, new_sp,
			    __USER32_CS, __USER32_DS, __USER32_DS);
}
#endif

/*
 *	switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes not supported here. Set the probe on schedule instead.
 * Function graph tracer not supported either.
 */
__notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
	struct thread_struct *prev = &prev_p->thread;
	struct thread_struct *next = &next_p->thread;
	int cpu = smp_processor_id();
	struct tss_struct *tss = &per_cpu(init_tss, cpu);
	unsigned fsindex, gsindex;
	bool preload_fpu;

	/*
	 * If the task has used the FPU in the last 5 timeslices, just do a
	 * full restore of the math state immediately to avoid the trap; the
	 * chances of needing FPU soon are obviously high now
	 */
	preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5;

	/* we're going to use this soon, after a few expensive things */
	if (preload_fpu)
		prefetch(next->fpu.state);

	/*
	 * Reload esp0, LDT and the page table pointer:
	 */
	load_sp0(tss, next);

	/*
	 * Switch DS and ES.
	 * This won't pick up thread selector changes, but I guess that is ok.
	 */
	savesegment(es, prev->es);
	if (unlikely(next->es | prev->es))
		loadsegment(es, next->es);

	savesegment(ds, prev->ds);
	if (unlikely(next->ds | prev->ds))
		loadsegment(ds, next->ds);

	/* We must save %fs and %gs before load_TLS() because
	 * %fs and %gs may be cleared by load_TLS().
	 *
	 * (e.g. xen_load_tls())
	 */
	savesegment(fs, fsindex);
	savesegment(gs, gsindex);

	load_TLS(next, cpu);

	/* Must be after DS reload */
	__unlazy_fpu(prev_p);

	/* Make sure cpu is ready for new context */
	if (preload_fpu)
		clts();

	/*
	 * Leave lazy mode, flushing any hypercalls made here.
	 * This must be done before restoring TLS segments so
	 * the GDT and LDT are properly updated, and must be
	 * done before math_state_restore, so the TS bit is up
	 * to date.
	 */
	arch_end_context_switch(next_p);

	/*
	 * Switch FS and GS.
	 *
	 * A segment register != 0 always requires a reload.  Also
	 * reload when it has changed.  When the previous process used
	 * a 64-bit base, always reload to avoid an information leak.
	 */
	if (unlikely(fsindex | next->fsindex | prev->fs)) {
		loadsegment(fs, next->fsindex);
		/*
		 * Check if the user used a selector != 0; if so,
		 * clear the 64-bit base, since the overloaded base is
		 * always mapped to the NULL selector.
		 */
		if (fsindex)
			prev->fs = 0;
	}
	/* when the next process has a 64-bit base, use it */
	if (next->fs)
		wrmsrl(MSR_FS_BASE, next->fs);
	prev->fsindex = fsindex;

	if (unlikely(gsindex | next->gsindex | prev->gs)) {
		load_gs_index(next->gsindex);
		if (gsindex)
			prev->gs = 0;
	}
	if (next->gs)
		wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
	prev->gsindex = gsindex;

	/*
	 * Switch the PDA and FPU contexts.
	 */
	prev->usersp = percpu_read(old_rsp);
	percpu_write(old_rsp, next->usersp);
	percpu_write(current_task, next_p);

	percpu_write(kernel_stack,
		  (unsigned long)task_stack_page(next_p) +
		  THREAD_SIZE - KERNEL_STACK_OFFSET);

	/*
	 * Now maybe reload the debug registers and handle I/O bitmaps
	 */
	if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
		     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
		__switch_to_xtra(prev_p, next_p, tss);

	/*
	 * Preload the FPU context, now that we've determined that the
	 * task is likely to be using it.
	 */
	if (preload_fpu)
		__math_state_restore();

	return prev_p;
}

void set_personality_64bit(void)
{
	/* inherit personality from parent */

	/* Make sure to be in 64bit mode */
	clear_thread_flag(TIF_IA32);

	/* Ensure the corresponding mm is not marked. */
	if (current->mm)
		current->mm->context.ia32_compat = 0;

	/* TBD: overwrites user setup. Should have two bits.
	   But 64bit processes have always behaved this way,
	   so it's not too bad. The main problem is just that
	   32-bit children are affected again. */
	current->personality &= ~READ_IMPLIES_EXEC;
}

void set_personality_ia32(void)
{
	/* inherit personality from parent */

	/* Make sure to be in 32bit mode */
	set_thread_flag(TIF_IA32);
	current->personality |= force_personality32;

	/* Mark the associated mm as containing 32-bit tasks. */
	if (current->mm)
		current->mm->context.ia32_compat = 1;

	/* Prepare the first "return" to user space */
	current_thread_info()->status |= TS_COMPAT;
}

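/*
 * Walk the sleeping task's frame-pointer chain (at most 16 frames) and
 * return the first return address outside the scheduler, or 0 if none
 * can be found; used for /proc/<pid>/wchan.
 */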
unsigned long get_wchan(struct task_struct *p)
{
	unsigned long stack;
	u64 fp, ip;
	int count = 0;

	if (!p || p == current || p->state == TASK_RUNNING)
		return 0;
	stack = (unsigned long)task_stack_page(p);
	if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE)
		return 0;
	fp = *(u64 *)(p->thread.sp);
	do {
		if (fp < (unsigned long)stack ||
		    fp >= (unsigned long)stack+THREAD_SIZE)
			return 0;
		ip = *(u64 *)(fp+8);
		if (!in_sched_functions(ip))
			return ip;
		fp = *(u64 *)fp;
	} while (count++ < 16);
	return 0;
}

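/*
 * arch_prctl() backend: get or set the FS/GS base of a task.  Bases
 * that fit in 32 bits are placed in a GDT TLS slot because that is
 * faster to switch; larger bases go through the FS_BASE and
 * KERNEL_GS_BASE MSRs.
 */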
long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
	int ret = 0;
	int doit = task == current;
	int cpu;

	switch (code) {
	case ARCH_SET_GS:
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, GS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				load_gs_index(GS_TLS_SEL);
			}
			task->thread.gsindex = GS_TLS_SEL;
			task->thread.gs = 0;
		} else {
			task->thread.gsindex = 0;
			task->thread.gs = addr;
			if (doit) {
				load_gs_index(0);
				ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_SET_FS:
		/* Not strictly needed for fs, but do it for symmetry
		   with gs */
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, FS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				loadsegment(fs, FS_TLS_SEL);
			}
			task->thread.fsindex = FS_TLS_SEL;
			task->thread.fs = 0;
		} else {
			task->thread.fsindex = 0;
			task->thread.fs = addr;
			if (doit) {
				/* set the selector to 0 to not confuse
				   __switch_to */
				loadsegment(fs, 0);
				ret = checking_wrmsrl(MSR_FS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_GET_FS: {
		unsigned long base;
		if (task->thread.fsindex == FS_TLS_SEL)
			base = read_32bit_tls(task, FS_TLS);
		else if (doit)
			rdmsrl(MSR_FS_BASE, base);
		else
			base = task->thread.fs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}
	case ARCH_GET_GS: {
		unsigned long base;
		unsigned gsindex;
		if (task->thread.gsindex == GS_TLS_SEL)
			base = read_32bit_tls(task, GS_TLS);
		else if (doit) {
			savesegment(gs, gsindex);
			if (gsindex)
				rdmsrl(MSR_KERNEL_GS_BASE, base);
			else
				base = task->thread.gs;
		} else
			base = task->thread.gs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}

	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}

long sys_arch_prctl(int code, unsigned long addr)
{
	return do_arch_prctl(current, code, addr);
}

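/*
 * Report a task's user stack pointer: 32-bit tasks read it from the
 * saved pt_regs, 64-bit tasks from thread.usersp (see old_rsp above).
 */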
unsigned long KSTK_ESP(struct task_struct *task)
{
	return (test_tsk_thread_flag(task, TIF_IA32)) ?
			(task_pt_regs(task)->sp) : ((task)->thread.usersp);
}