xref: /linux/arch/x86/kernel/process_64.c (revision 98b8788ae91694499d1995035625bea16a4db0c4)
/*
 *  Copyright (C) 1995  Linus Torvalds
 *
 *  Pentium III FXSR, SSE support
 *	Gareth Hughes <gareth@valinux.com>, May 2000
 *
 *  X86-64 port
 *	Andi Kleen.
 *
 *	CPU hotplug support - ashok.raj@intel.com
 */

/*
 * This file handles the architecture-dependent parts of process handling.
 */

#include <linux/stackprotector.h>
#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/interrupt.h>
#include <linux/delay.h>
#include <linux/module.h>
#include <linux/ptrace.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/tick.h>
#include <linux/prctl.h>
#include <linux/uaccess.h>
#include <linux/io.h>
#include <linux/ftrace.h>

#include <asm/pgtable.h>
#include <asm/system.h>
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/mmu_context.h>
#include <asm/prctl.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/ia32.h>
#include <asm/idle.h>
#include <asm/syscalls.h>
#include <asm/ds.h>
#include <asm/debugreg.h>

asmlinkage extern void ret_from_fork(void);

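/*
 * Per-CPU state: old_rsp caches the user-space RSP of the task currently
 * on this CPU (saved on syscall entry and handed over in __switch_to()),
 * and is_idle records whether the CPU is sitting in its idle loop so the
 * idle notifiers fire exactly once per idle period.
 */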
DEFINE_PER_CPU(unsigned long, old_rsp);
static DEFINE_PER_CPU(unsigned char, is_idle);

unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;

static ATOMIC_NOTIFIER_HEAD(idle_notifier);

void idle_notifier_register(struct notifier_block *n)
{
	atomic_notifier_chain_register(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_register);

void idle_notifier_unregister(struct notifier_block *n)
{
	atomic_notifier_chain_unregister(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_unregister);

void enter_idle(void)
{
	percpu_write(is_idle, 1);
	atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}

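/*
 * Notify IDLE_END at most once per idle period: the atomic test-and-clear
 * of is_idle makes it safe for both the idle loop and an interrupt to get
 * here for the same period.
 */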
static void __exit_idle(void)
{
	if (x86_test_and_clear_bit_percpu(0, is_idle) == 0)
		return;
	atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
}

/* Called from interrupts to signify idle end */
void exit_idle(void)
{
	/* idle loop has pid 0 */
	if (current->pid)
		return;
	__exit_idle();
}

#ifndef CONFIG_SMP
static inline void play_dead(void)
{
	BUG();
}
#endif

/*
 * The idle thread. There's no useful work to be
 * done, so just try to conserve power and have a
 * low exit latency (i.e. sit in a loop waiting for
 * somebody to say that they'd like to reschedule)
 */
void cpu_idle(void)
{
	current_thread_info()->status |= TS_POLLING;

	/*
	 * If we're the non-boot CPU, nothing set the stack canary up
	 * for us.  CPU0 already has it initialized but no harm in
	 * doing it again.  This is a good place for updating it, as
	 * we won't ever return from this function (so the invalid
	 * canaries already on the stack won't ever trigger).
	 */
	boot_init_stack_canary();

	/* endless idle loop with no priority at all */
	while (1) {
		tick_nohz_stop_sched_tick(1);
		while (!need_resched()) {

			rmb();

			if (cpu_is_offline(smp_processor_id()))
				play_dead();
			/*
			 * Idle routines should keep interrupts disabled
			 * from here on, until they go to idle.
			 * Otherwise, idle callbacks can misfire.
			 */
			local_irq_disable();
			enter_idle();
			/* Don't trace irqs off for idle */
			stop_critical_timings();
			pm_idle();
			start_critical_timings();
			/*
			 * In many cases the interrupt that ended idle
			 * has already called exit_idle.  But some idle
			 * loops can be woken up without interrupt.
			 */
			__exit_idle();
		}

		tick_nohz_restart_sched_tick();
		preempt_enable_no_resched();
		schedule();
		preempt_disable();
	}
}

/* Also prints some state that isn't saved in the pt_regs */
void __show_regs(struct pt_regs *regs, int all)
{
	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
	unsigned long d0, d1, d2, d3, d6, d7;
	unsigned int fsindex, gsindex;
	unsigned int ds, cs, es;

	show_regs_common();
	printk(KERN_INFO "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
	printk_address(regs->ip, 1);
	printk(KERN_INFO "RSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss,
			regs->sp, regs->flags);
	printk(KERN_INFO "RAX: %016lx RBX: %016lx RCX: %016lx\n",
	       regs->ax, regs->bx, regs->cx);
	printk(KERN_INFO "RDX: %016lx RSI: %016lx RDI: %016lx\n",
	       regs->dx, regs->si, regs->di);
	printk(KERN_INFO "RBP: %016lx R08: %016lx R09: %016lx\n",
	       regs->bp, regs->r8, regs->r9);
	printk(KERN_INFO "R10: %016lx R11: %016lx R12: %016lx\n",
	       regs->r10, regs->r11, regs->r12);
	printk(KERN_INFO "R13: %016lx R14: %016lx R15: %016lx\n",
	       regs->r13, regs->r14, regs->r15);

	asm("movl %%ds,%0" : "=r" (ds));
	asm("movl %%cs,%0" : "=r" (cs));
	asm("movl %%es,%0" : "=r" (es));
	asm("movl %%fs,%0" : "=r" (fsindex));
	asm("movl %%gs,%0" : "=r" (gsindex));

	rdmsrl(MSR_FS_BASE, fs);
	rdmsrl(MSR_GS_BASE, gs);
	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

	if (!all)
		return;

	cr0 = read_cr0();
	cr2 = read_cr2();
	cr3 = read_cr3();
	cr4 = read_cr4();

	printk(KERN_INFO "FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
	       fs, fsindex, gs, gsindex, shadowgs);
	printk(KERN_INFO "CS:  %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
			es, cr0);
	printk(KERN_INFO "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
			cr4);

	get_debugreg(d0, 0);
	get_debugreg(d1, 1);
	get_debugreg(d2, 2);
	printk(KERN_INFO "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
	get_debugreg(d3, 3);
	get_debugreg(d6, 6);
	get_debugreg(d7, 7);
	printk(KERN_INFO "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
}

void show_regs(struct pt_regs *regs)
{
	show_registers(regs);
	show_trace(NULL, regs, (void *)(regs + 1), regs->bp);
}

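/*
 * Sanity check at final task teardown: by the time a task is released,
 * its mm must no longer carry an LDT.
 */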
void release_thread(struct task_struct *dead_task)
{
	if (dead_task->mm) {
		if (dead_task->mm->context.size) {
			printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
					dead_task->comm,
					dead_task->mm->context.ldt,
					dead_task->mm->context.size);
			BUG();
		}
	}
}

static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
{
	struct user_desc ud = {
		.base_addr = addr,
		.limit = 0xfffff,
		.seg_32bit = 1,
		.limit_in_pages = 1,
		.useable = 1,
	};
	struct desc_struct *desc = t->thread.tls_array;
	desc += tls;
	fill_ldt(desc, &ud);
}

static inline u32 read_32bit_tls(struct task_struct *t, int tls)
{
	return get_desc_base(&t->thread.tls_array[tls]);
}

/*
 * This gets called before we allocate a new thread and copy
 * the current task into it.
 */
void prepare_to_copy(struct task_struct *tsk)
{
	unlazy_fpu(tsk);
}

int copy_thread(unsigned long clone_flags, unsigned long sp,
		unsigned long unused,
	struct task_struct *p, struct pt_regs *regs)
{
	int err;
	struct pt_regs *childregs;
	struct task_struct *me = current;

	childregs = ((struct pt_regs *)
			(THREAD_SIZE + task_stack_page(p))) - 1;
	*childregs = *regs;

	childregs->ax = 0;
	childregs->sp = sp;
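	/*
	 * An sp of ~0UL means we are creating a kernel thread (this is what
	 * kernel_thread() passes); let the child run on its kernel stack.
	 */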
	if (sp == ~0UL)
		childregs->sp = (unsigned long)childregs;

	p->thread.sp = (unsigned long) childregs;
	p->thread.sp0 = (unsigned long) (childregs+1);
	p->thread.usersp = me->thread.usersp;

	set_tsk_thread_flag(p, TIF_FORK);

	p->thread.fs = me->thread.fs;
	p->thread.gs = me->thread.gs;
	p->thread.io_bitmap_ptr = NULL;

	savesegment(gs, p->thread.gsindex);
	savesegment(fs, p->thread.fsindex);
	savesegment(es, p->thread.es);
	savesegment(ds, p->thread.ds);

	err = -ENOMEM;
	memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));

	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
		p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
		if (!p->thread.io_bitmap_ptr) {
			p->thread.io_bitmap_max = 0;
			return -ENOMEM;
		}
		memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
				IO_BITMAP_BYTES);
		set_tsk_thread_flag(p, TIF_IO_BITMAP);
	}

	/*
	 * Set a new TLS for the child thread?
	 */
	if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
		if (test_thread_flag(TIF_IA32))
			err = do_set_thread_area(p, -1,
				(struct user_desc __user *)childregs->si, 0);
		else
#endif
			err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
		if (err)
			goto out;
	}

	clear_tsk_thread_flag(p, TIF_DS_AREA_MSR);
	p->thread.ds_ctx = NULL;

	clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);
	p->thread.debugctlmsr = 0;

	err = 0;
out:
	if (err && p->thread.io_bitmap_ptr) {
		kfree(p->thread.io_bitmap_ptr);
		p->thread.io_bitmap_max = 0;
	}

	return err;
}

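/*
 * Set up the register and segment state for a freshly exec'ed image:
 * flat user code/data selectors, zero FS/GS bases and the new IP/SP.
 */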
static void
start_thread_common(struct pt_regs *regs, unsigned long new_ip,
		    unsigned long new_sp,
		    unsigned int _cs, unsigned int _ss, unsigned int _ds)
{
	loadsegment(fs, 0);
	loadsegment(es, _ds);
	loadsegment(ds, _ds);
	load_gs_index(0);
	regs->ip		= new_ip;
	regs->sp		= new_sp;
	percpu_write(old_rsp, new_sp);
	regs->cs		= _cs;
	regs->ss		= _ss;
	regs->flags		= X86_EFLAGS_IF;
	set_fs(USER_DS);
	/*
	 * Free the old FP and other extended state
	 */
	free_thread_xstate(current);
}

void
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
	start_thread_common(regs, new_ip, new_sp,
			    __USER_CS, __USER_DS, 0);
}

#ifdef CONFIG_IA32_EMULATION
void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp)
{
	start_thread_common(regs, new_ip, new_sp,
			    __USER32_CS, __USER32_DS, __USER32_DS);
}
#endif

/*
 *	switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes not supported here. Set the probe on schedule instead.
 * Function graph tracer is not supported either.
 */
__notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
	struct thread_struct *prev = &prev_p->thread;
	struct thread_struct *next = &next_p->thread;
	int cpu = smp_processor_id();
	struct tss_struct *tss = &per_cpu(init_tss, cpu);
	unsigned fsindex, gsindex;
	bool preload_fpu;

	/*
	 * If the task has used the FPU in the last 5 timeslices, just do a
	 * full restore of the math state immediately to avoid the trap;
	 * the chances of needing the FPU soon are obviously high now.
	 */
	preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5;

	/* we're going to use this soon, after a few expensive things */
	if (preload_fpu)
		prefetch(next->xstate);

	/*
	 * Reload esp0, LDT and the page table pointer:
	 */
	load_sp0(tss, next);

	/*
	 * Switch DS and ES.
	 * This won't pick up thread selector changes, but I guess that is ok.
	 */
	savesegment(es, prev->es);
	if (unlikely(next->es | prev->es))
		loadsegment(es, next->es);

	savesegment(ds, prev->ds);
	if (unlikely(next->ds | prev->ds))
		loadsegment(ds, next->ds);

	/* We must save %fs and %gs before load_TLS() because
	 * %fs and %gs may be cleared by load_TLS().
	 *
	 * (e.g. xen_load_tls())
	 */
	savesegment(fs, fsindex);
	savesegment(gs, gsindex);

	load_TLS(next, cpu);

	/* Must be after DS reload */
	unlazy_fpu(prev_p);

	/* Make sure cpu is ready for new context */
	if (preload_fpu)
		clts();

	/*
	 * Leave lazy mode, flushing any hypercalls made here.
	 * This must be done before restoring TLS segments so
	 * the GDT and LDT are properly updated, and must be
	 * done before math_state_restore, so the TS bit is up
	 * to date.
	 */
	arch_end_context_switch(next_p);

	/*
	 * Switch FS and GS.
	 *
	 * A segment register != 0 always requires a reload.  Also
	 * reload when it has changed.  When the previous process used a
	 * 64-bit base, always reload to avoid an information leak.
	 */
	if (unlikely(fsindex | next->fsindex | prev->fs)) {
		loadsegment(fs, next->fsindex);
		/*
		 * Check if the user used a selector != 0; if yes
		 * clear the 64-bit base, since the overloaded base is
		 * always mapped to the NULL selector.
		 */
		if (fsindex)
			prev->fs = 0;
	}
	/* when the next process has a 64-bit base, use it */
	if (next->fs)
		wrmsrl(MSR_FS_BASE, next->fs);
	prev->fsindex = fsindex;

	if (unlikely(gsindex | next->gsindex | prev->gs)) {
		load_gs_index(next->gsindex);
		if (gsindex)
			prev->gs = 0;
	}
	if (next->gs)
		wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
	prev->gsindex = gsindex;

	/*
	 * Switch the PDA and FPU contexts.
	 */
	prev->usersp = percpu_read(old_rsp);
	percpu_write(old_rsp, next->usersp);
	percpu_write(current_task, next_p);

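	/*
	 * Point the per-CPU kernel_stack at the top of the next task's
	 * stack so entry code picks up the right kernel stack pointer.
	 */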
	percpu_write(kernel_stack,
		  (unsigned long)task_stack_page(next_p) +
		  THREAD_SIZE - KERNEL_STACK_OFFSET);

	/*
	 * Now maybe reload the debug registers and handle I/O bitmaps
	 */
	if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
		     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
		__switch_to_xtra(prev_p, next_p, tss);

	/*
	 * Preload the FPU context, now that we've determined that the
	 * task is likely to be using it.
	 */
	if (preload_fpu)
		__math_state_restore();

	return prev_p;
}

/*
 * sys_execve() executes a new program.
 */
asmlinkage
long sys_execve(char __user *name, char __user * __user *argv,
		char __user * __user *envp, struct pt_regs *regs)
{
	long error;
	char *filename;

	filename = getname(name);
	error = PTR_ERR(filename);
	if (IS_ERR(filename))
		return error;
	error = do_execve(filename, argv, envp, regs);
	putname(filename);
	return error;
}

void set_personality_64bit(void)
{
	/* inherit personality from parent */

	/* Make sure to be in 64bit mode */
	clear_thread_flag(TIF_IA32);

	/* TBD: overwrites user setup. Should have two bits.
	   But 64-bit processes have always behaved this way,
	   so it's not too bad. The main problem is just that
	   32-bit children are affected again. */
	current->personality &= ~READ_IMPLIES_EXEC;
}

asmlinkage long
sys_clone(unsigned long clone_flags, unsigned long newsp,
	  void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
{
	if (!newsp)
		newsp = regs->sp;
	return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
}

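/*
 * Walk the saved frame pointers on @p's kernel stack (at most 16 frames)
 * and return the first return address outside the scheduler, i.e. where
 * the task went to sleep.  Returns 0 if nothing sensible is found.
 */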
unsigned long get_wchan(struct task_struct *p)
{
	unsigned long stack;
	u64 fp, ip;
	int count = 0;

	if (!p || p == current || p->state == TASK_RUNNING)
		return 0;
	stack = (unsigned long)task_stack_page(p);
	if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE)
		return 0;
	fp = *(u64 *)(p->thread.sp);
	do {
		if (fp < (unsigned long)stack ||
		    fp >= (unsigned long)stack+THREAD_SIZE)
			return 0;
		ip = *(u64 *)(fp+8);
		if (!in_sched_functions(ip))
			return ip;
		fp = *(u64 *)fp;
	} while (count++ < 16);
	return 0;
}

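/*
 * Get or set the FS/GS base for @task.  Bases that fit in 32 bits are
 * installed as a GDT TLS entry (cheaper to switch); larger bases are
 * written to the FS_BASE/KERNEL_GS_BASE MSRs with a null selector.
 */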
long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
	int ret = 0;
	int doit = task == current;
	int cpu;

	switch (code) {
	case ARCH_SET_GS:
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, GS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				load_gs_index(GS_TLS_SEL);
			}
			task->thread.gsindex = GS_TLS_SEL;
			task->thread.gs = 0;
		} else {
			task->thread.gsindex = 0;
			task->thread.gs = addr;
			if (doit) {
				load_gs_index(0);
				ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_SET_FS:
		/* Not strictly needed for fs, but do it for symmetry
		   with gs */
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, FS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				loadsegment(fs, FS_TLS_SEL);
			}
			task->thread.fsindex = FS_TLS_SEL;
			task->thread.fs = 0;
		} else {
			task->thread.fsindex = 0;
			task->thread.fs = addr;
			if (doit) {
				/* set the selector to 0 to not confuse
				   __switch_to */
				loadsegment(fs, 0);
				ret = checking_wrmsrl(MSR_FS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_GET_FS: {
		unsigned long base;
		if (task->thread.fsindex == FS_TLS_SEL)
			base = read_32bit_tls(task, FS_TLS);
		else if (doit)
			rdmsrl(MSR_FS_BASE, base);
		else
			base = task->thread.fs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}
	case ARCH_GET_GS: {
		unsigned long base;
		unsigned gsindex;
		if (task->thread.gsindex == GS_TLS_SEL)
			base = read_32bit_tls(task, GS_TLS);
		else if (doit) {
			savesegment(gs, gsindex);
			if (gsindex)
				rdmsrl(MSR_KERNEL_GS_BASE, base);
			else
				base = task->thread.gs;
		} else
			base = task->thread.gs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}

	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}

long sys_arch_prctl(int code, unsigned long addr)
{
	return do_arch_prctl(current, code, addr);
}

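/*
 * For 64-bit tasks the syscall fast path does not record the user RSP in
 * pt_regs, so report the value cached in thread.usersp (kept up to date
 * via old_rsp in __switch_to()); compat (TIF_IA32) tasks enter through
 * paths that do fill in pt_regs->sp.
 */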
unsigned long KSTK_ESP(struct task_struct *task)
{
	return (test_tsk_thread_flag(task, TIF_IA32)) ?
			(task_pt_regs(task)->sp) : ((task)->thread.usersp);
}