xref: /linux/arch/x86/kernel/process_64.c (revision e35fa8c2d0feb977c2f7d14a973b4132483ffef3)
1 /*
2  *  Copyright (C) 1995  Linus Torvalds
3  *
4  *  Pentium III FXSR, SSE support
5  *	Gareth Hughes <gareth@valinux.com>, May 2000
6  *
7  *  X86-64 port
8  *	Andi Kleen.
9  *
10  *	CPU hotplug support - ashok.raj@intel.com
11  */
12 
13 /*
14  * This file handles the architecture-dependent parts of process handling..
15  */
16 
17 #include <linux/stackprotector.h>
18 #include <linux/cpu.h>
19 #include <linux/errno.h>
20 #include <linux/sched.h>
21 #include <linux/fs.h>
22 #include <linux/kernel.h>
23 #include <linux/mm.h>
24 #include <linux/elfcore.h>
25 #include <linux/smp.h>
26 #include <linux/slab.h>
27 #include <linux/user.h>
28 #include <linux/interrupt.h>
29 #include <linux/delay.h>
30 #include <linux/module.h>
31 #include <linux/ptrace.h>
32 #include <linux/notifier.h>
33 #include <linux/kprobes.h>
34 #include <linux/kdebug.h>
35 #include <linux/tick.h>
36 #include <linux/prctl.h>
37 #include <linux/uaccess.h>
38 #include <linux/io.h>
39 #include <linux/ftrace.h>
40 #include <linux/cpuidle.h>
41 
42 #include <asm/pgtable.h>
43 #include <asm/system.h>
44 #include <asm/processor.h>
45 #include <asm/i387.h>
46 #include <asm/mmu_context.h>
47 #include <asm/prctl.h>
48 #include <asm/desc.h>
49 #include <asm/proto.h>
50 #include <asm/ia32.h>
51 #include <asm/idle.h>
52 #include <asm/syscalls.h>
53 #include <asm/debugreg.h>
54 #include <asm/nmi.h>
55 
56 asmlinkage extern void ret_from_fork(void);
57 
58 DEFINE_PER_CPU(unsigned long, old_rsp);
59 static DEFINE_PER_CPU(unsigned char, is_idle);
60 
61 static ATOMIC_NOTIFIER_HEAD(idle_notifier);
62 
/*
 * Register @n on the atomic idle notifier chain; it will be called with
 * IDLE_START / IDLE_END events from enter_idle()/__exit_idle().
 */
void idle_notifier_register(struct notifier_block *n)
{
	atomic_notifier_chain_register(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_register);
68 
/*
 * Remove @n from the atomic idle notifier chain; after this it will no
 * longer receive IDLE_START / IDLE_END events.
 */
void idle_notifier_unregister(struct notifier_block *n)
{
	atomic_notifier_chain_unregister(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_unregister);
74 
/*
 * Mark this CPU as idle and fire the IDLE_START notifiers.  The per-cpu
 * is_idle flag is set first so a subsequent exit_idle() from interrupt
 * context sees it and can deliver the matching IDLE_END.
 */
void enter_idle(void)
{
	percpu_write(is_idle, 1);
	atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}
80 
81 static void __exit_idle(void)
82 {
83 	if (x86_test_and_clear_bit_percpu(0, is_idle) == 0)
84 		return;
85 	atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
86 }
87 
88 /* Called from interrupts to signify idle end */
89 void exit_idle(void)
90 {
91 	/* idle loop has pid 0 */
92 	if (current->pid)
93 		return;
94 	__exit_idle();
95 }
96 
#ifndef CONFIG_SMP
/*
 * UP stub: a CPU can never be offlined on a non-SMP kernel, so reaching
 * play_dead() here is a logic error.  SMP builds provide the real
 * implementation elsewhere.
 */
static inline void play_dead(void)
{
	BUG();
}
#endif
103 
/*
 * The idle thread. There's no useful work to be
 * done, so just try to conserve power and have a
 * low exit latency (ie sit in a loop waiting for
 * somebody to say that they'd like to reschedule)
 */
void cpu_idle(void)
{
	/* Let the scheduler know we poll need_resched() while idle. */
	current_thread_info()->status |= TS_POLLING;

	/*
	 * If we're the non-boot CPU, nothing set the stack canary up
	 * for us.  CPU0 already has it initialized but no harm in
	 * doing it again.  This is a good place for updating it, as
	 * we wont ever return from this function (so the invalid
	 * canaries already on the stack wont ever trigger).
	 */
	boot_init_stack_canary();

	/* endless idle loop with no priority at all */
	while (1) {
		tick_nohz_idle_enter();
		while (!need_resched()) {

			/* Order the TS_POLLING read vs. the flag checks below. */
			rmb();

			if (cpu_is_offline(smp_processor_id()))
				play_dead();
			/*
			 * Idle routines should keep interrupts disabled
			 * from here on, until they go to idle.
			 * Otherwise, idle callbacks can misfire.
			 */
			local_touch_nmi();
			local_irq_disable();
			enter_idle();
			/* Don't trace irqs off for idle */
			stop_critical_timings();

			/* enter_idle() needs rcu for notifiers */
			rcu_idle_enter();

			/* Fall back to pm_idle if cpuidle is unavailable. */
			if (cpuidle_idle_call())
				pm_idle();

			rcu_idle_exit();
			start_critical_timings();

			/* In many cases the interrupt that ended idle
			   has already called exit_idle. But some idle
			   loops can be woken up without interrupt. */
			__exit_idle();
		}

		tick_nohz_idle_exit();
		preempt_enable_no_resched();
		schedule();
		preempt_disable();
	}
}
164 
/*
 * Prints also some state that isn't saved in the pt_regs: the current
 * segment selectors, FS/GS base MSRs and, when @all is non-zero, the
 * control and debug registers of the executing CPU.
 */
void __show_regs(struct pt_regs *regs, int all)
{
	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
	unsigned long d0, d1, d2, d3, d6, d7;
	unsigned int fsindex, gsindex;
	unsigned int ds, cs, es;

	show_regs_common();
	printk(KERN_DEFAULT "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
	printk_address(regs->ip, 1);
	printk(KERN_DEFAULT "RSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss,
			regs->sp, regs->flags);
	printk(KERN_DEFAULT "RAX: %016lx RBX: %016lx RCX: %016lx\n",
	       regs->ax, regs->bx, regs->cx);
	printk(KERN_DEFAULT "RDX: %016lx RSI: %016lx RDI: %016lx\n",
	       regs->dx, regs->si, regs->di);
	printk(KERN_DEFAULT "RBP: %016lx R08: %016lx R09: %016lx\n",
	       regs->bp, regs->r8, regs->r9);
	printk(KERN_DEFAULT "R10: %016lx R11: %016lx R12: %016lx\n",
	       regs->r10, regs->r11, regs->r12);
	printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n",
	       regs->r13, regs->r14, regs->r15);

	/* Read the live segment selectors from the CPU, not from pt_regs. */
	asm("movl %%ds,%0" : "=r" (ds));
	asm("movl %%cs,%0" : "=r" (cs));
	asm("movl %%es,%0" : "=r" (es));
	asm("movl %%fs,%0" : "=r" (fsindex));
	asm("movl %%gs,%0" : "=r" (gsindex));

	/* The 64-bit FS/GS bases live in MSRs, not in the selectors. */
	rdmsrl(MSR_FS_BASE, fs);
	rdmsrl(MSR_GS_BASE, gs);
	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

	/* Caller only wanted the basic register dump. */
	if (!all)
		return;

	cr0 = read_cr0();
	cr2 = read_cr2();
	cr3 = read_cr3();
	cr4 = read_cr4();

	printk(KERN_DEFAULT "FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
	       fs, fsindex, gs, gsindex, shadowgs);
	printk(KERN_DEFAULT "CS:  %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
			es, cr0);
	printk(KERN_DEFAULT "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
			cr4);

	get_debugreg(d0, 0);
	get_debugreg(d1, 1);
	get_debugreg(d2, 2);
	printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
	get_debugreg(d3, 3);
	get_debugreg(d6, 6);
	get_debugreg(d7, 7);
	printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
}
223 
224 void release_thread(struct task_struct *dead_task)
225 {
226 	if (dead_task->mm) {
227 		if (dead_task->mm->context.size) {
228 			printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
229 					dead_task->comm,
230 					dead_task->mm->context.ldt,
231 					dead_task->mm->context.size);
232 			BUG();
233 		}
234 	}
235 }
236 
237 static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
238 {
239 	struct user_desc ud = {
240 		.base_addr = addr,
241 		.limit = 0xfffff,
242 		.seg_32bit = 1,
243 		.limit_in_pages = 1,
244 		.useable = 1,
245 	};
246 	struct desc_struct *desc = t->thread.tls_array;
247 	desc += tls;
248 	fill_ldt(desc, &ud);
249 }
250 
/* Return the base address stored in @t's TLS descriptor slot @tls. */
static inline u32 read_32bit_tls(struct task_struct *t, int tls)
{
	return get_desc_base(&t->thread.tls_array[tls]);
}
255 
/*
 * This gets called before we allocate a new thread and copy
 * the current task into it.  Flush the parent's lazy FPU state to
 * memory so the child starts from a consistent copy.
 */
void prepare_to_copy(struct task_struct *tsk)
{
	unlazy_fpu(tsk);
}
264 
/*
 * Set up the architecture-specific thread state for a new task @p being
 * cloned from the current task: child pt_regs, kernel/user stack
 * pointers, segment state, optional I/O bitmap copy and optional TLS.
 * Returns 0 on success or a negative errno; on error any allocated
 * I/O bitmap is freed.
 */
int copy_thread(unsigned long clone_flags, unsigned long sp,
		unsigned long unused,
	struct task_struct *p, struct pt_regs *regs)
{
	int err;
	struct pt_regs *childregs;
	struct task_struct *me = current;

	/* Child's pt_regs live at the top of its kernel stack. */
	childregs = ((struct pt_regs *)
			(THREAD_SIZE + task_stack_page(p))) - 1;
	*childregs = *regs;

	/* The child sees 0 as the return value of fork/clone. */
	childregs->ax = 0;
	if (user_mode(regs))
		childregs->sp = sp;
	else
		/* Kernel thread: stack starts right below its pt_regs. */
		childregs->sp = (unsigned long)childregs;

	p->thread.sp = (unsigned long) childregs;
	p->thread.sp0 = (unsigned long) (childregs+1);
	p->thread.usersp = me->thread.usersp;

	set_tsk_thread_flag(p, TIF_FORK);

	p->thread.io_bitmap_ptr = NULL;

	/*
	 * A non-zero selector means the segment base comes from the
	 * GDT/LDT, so the saved 64-bit base is irrelevant (and vice
	 * versa) -- inherit whichever the parent actually uses.
	 */
	savesegment(gs, p->thread.gsindex);
	p->thread.gs = p->thread.gsindex ? 0 : me->thread.gs;
	savesegment(fs, p->thread.fsindex);
	p->thread.fs = p->thread.fsindex ? 0 : me->thread.fs;
	savesegment(es, p->thread.es);
	savesegment(ds, p->thread.ds);

	err = -ENOMEM;
	/* The child must not inherit the parent's hw breakpoints. */
	memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));

	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
		p->thread.io_bitmap_ptr = kmemdup(me->thread.io_bitmap_ptr,
						  IO_BITMAP_BYTES, GFP_KERNEL);
		if (!p->thread.io_bitmap_ptr) {
			p->thread.io_bitmap_max = 0;
			return -ENOMEM;
		}
		set_tsk_thread_flag(p, TIF_IO_BITMAP);
	}

	/*
	 * Set a new TLS for the child thread?
	 */
	if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
		if (test_thread_flag(TIF_IA32))
			err = do_set_thread_area(p, -1,
				(struct user_desc __user *)childregs->si, 0);
		else
#endif
			err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
		if (err)
			goto out;
	}
	err = 0;
out:
	/* On failure undo the I/O bitmap allocation from above. */
	if (err && p->thread.io_bitmap_ptr) {
		kfree(p->thread.io_bitmap_ptr);
		p->thread.io_bitmap_max = 0;
	}

	return err;
}
334 
/*
 * Common exec-time setup: reset the segment registers, point the
 * user register frame at the new program's entry (@new_ip) and stack
 * (@new_sp) with the given code/stack/data selectors, and drop the old
 * extended FP state.
 */
static void
start_thread_common(struct pt_regs *regs, unsigned long new_ip,
		    unsigned long new_sp,
		    unsigned int _cs, unsigned int _ss, unsigned int _ds)
{
	/* Clear fs/gs (selector and base) left over from the old image. */
	loadsegment(fs, 0);
	loadsegment(es, _ds);
	loadsegment(ds, _ds);
	load_gs_index(0);
	regs->ip		= new_ip;
	regs->sp		= new_sp;
	percpu_write(old_rsp, new_sp);
	regs->cs		= _cs;
	regs->ss		= _ss;
	/* Start with interrupts enabled, all other flags clear. */
	regs->flags		= X86_EFLAGS_IF;
	/*
	 * Free the old FP and other extended state
	 */
	free_thread_xstate(current);
}
355 
/* Start a 64-bit user program at @new_ip with stack @new_sp. */
void
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
	start_thread_common(regs, new_ip, new_sp,
			    __USER_CS, __USER_DS, 0);
}
362 
#ifdef CONFIG_IA32_EMULATION
/* Start a 32-bit (compat) user program using the 32-bit selectors. */
void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp)
{
	start_thread_common(regs, new_ip, new_sp,
			    __USER32_CS, __USER32_DS, __USER32_DS);
}
#endif
370 
/*
 *	switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes not supported here. Set the probe on schedule instead.
 * Function graph tracer not supported too.
 *
 * Returns the previous task so the switch_to() asm glue can hand it to
 * the scheduler.  The statement order below is load-bearing: segment
 * registers must be saved before load_TLS() and the per-cpu switches
 * must come after the segment reloads.
 */
__notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
	struct thread_struct *prev = &prev_p->thread;
	struct thread_struct *next = &next_p->thread;
	int cpu = smp_processor_id();
	struct tss_struct *tss = &per_cpu(init_tss, cpu);
	unsigned fsindex, gsindex;
	fpu_switch_t fpu;

	fpu = switch_fpu_prepare(prev_p, next_p);

	/*
	 * Reload esp0, LDT and the page table pointer:
	 */
	load_sp0(tss, next);

	/*
	 * Switch DS and ES.
	 * This won't pick up thread selector changes, but I guess that is ok.
	 */
	savesegment(es, prev->es);
	if (unlikely(next->es | prev->es))
		loadsegment(es, next->es);

	savesegment(ds, prev->ds);
	if (unlikely(next->ds | prev->ds))
		loadsegment(ds, next->ds);


	/* We must save %fs and %gs before load_TLS() because
	 * %fs and %gs may be cleared by load_TLS().
	 *
	 * (e.g. xen_load_tls())
	 */
	savesegment(fs, fsindex);
	savesegment(gs, gsindex);

	load_TLS(next, cpu);

	/*
	 * Leave lazy mode, flushing any hypercalls made here.
	 * This must be done before restoring TLS segments so
	 * the GDT and LDT are properly updated, and must be
	 * done before math_state_restore, so the TS bit is up
	 * to date.
	 */
	arch_end_context_switch(next_p);

	/*
	 * Switch FS and GS.
	 *
	 * Segment register != 0 always requires a reload.  Also
	 * reload when it has changed.  When prev process used 64bit
	 * base always reload to avoid an information leak.
	 */
	if (unlikely(fsindex | next->fsindex | prev->fs)) {
		loadsegment(fs, next->fsindex);
		/*
		 * Check if the user used a selector != 0; if yes
		 *  clear 64bit base, since overloaded base is always
		 *  mapped to the Null selector
		 */
		if (fsindex)
			prev->fs = 0;
	}
	/* when next process has a 64bit base use it */
	if (next->fs)
		wrmsrl(MSR_FS_BASE, next->fs);
	prev->fsindex = fsindex;

	/* Same dance for GS, using the kernel GS base MSR. */
	if (unlikely(gsindex | next->gsindex | prev->gs)) {
		load_gs_index(next->gsindex);
		if (gsindex)
			prev->gs = 0;
	}
	if (next->gs)
		wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
	prev->gsindex = gsindex;

	switch_fpu_finish(next_p, fpu);

	/*
	 * Switch the PDA and FPU contexts.
	 */
	prev->usersp = percpu_read(old_rsp);
	percpu_write(old_rsp, next->usersp);
	percpu_write(current_task, next_p);

	percpu_write(kernel_stack,
		  (unsigned long)task_stack_page(next_p) +
		  THREAD_SIZE - KERNEL_STACK_OFFSET);

	/*
	 * Now maybe reload the debug registers and handle I/O bitmaps
	 */
	if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
		     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
		__switch_to_xtra(prev_p, next_p, tss);

	return prev_p;
}
483 
/* Switch the current task's exec personality to native 64-bit mode. */
void set_personality_64bit(void)
{
	/* inherit personality from parent */

	/* Make sure to be in 64bit mode */
	clear_thread_flag(TIF_IA32);

	/* Ensure the corresponding mm is not marked. */
	if (current->mm)
		current->mm->context.ia32_compat = 0;

	/* TBD: overwrites user setup. Should have two bits.
	   But 64bit processes have always behaved this way,
	   so it's not too bad. The main problem is just that
	   32bit childs are affected again. */
	current->personality &= ~READ_IMPLIES_EXEC;
}
501 
/* Switch the current task's exec personality to 32-bit (ia32) mode. */
void set_personality_ia32(void)
{
	/* inherit personality from parent */

	/* Make sure to be in 32bit mode */
	set_thread_flag(TIF_IA32);
	current->personality |= force_personality32;

	/* Mark the associated mm as containing 32-bit tasks. */
	if (current->mm)
		current->mm->context.ia32_compat = 1;

	/* Prepare the first "return" to user space */
	current_thread_info()->status |= TS_COMPAT;
}
517 
518 unsigned long get_wchan(struct task_struct *p)
519 {
520 	unsigned long stack;
521 	u64 fp, ip;
522 	int count = 0;
523 
524 	if (!p || p == current || p->state == TASK_RUNNING)
525 		return 0;
526 	stack = (unsigned long)task_stack_page(p);
527 	if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE)
528 		return 0;
529 	fp = *(u64 *)(p->thread.sp);
530 	do {
531 		if (fp < (unsigned long)stack ||
532 		    fp >= (unsigned long)stack+THREAD_SIZE)
533 			return 0;
534 		ip = *(u64 *)(fp+8);
535 		if (!in_sched_functions(ip))
536 			return ip;
537 		fp = *(u64 *)fp;
538 	} while (count++ < 16);
539 	return 0;
540 }
541 
/*
 * Implement the arch_prctl() operations for @task: set or get the FS
 * and GS segment bases.  Bases <= 4GB are installed via a GDT TLS slot
 * (faster to context-switch); larger bases go through the FS_BASE /
 * KERNEL_GS_BASE MSRs.  When @task is the current task the change is
 * applied to the live CPU state as well.  Returns 0 or a negative
 * errno.
 */
long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
	int ret = 0;
	/* Operating on ourselves means we must also update the CPU. */
	int doit = task == current;
	int cpu;

	switch (code) {
	case ARCH_SET_GS:
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, GS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				load_gs_index(GS_TLS_SEL);
			}
			task->thread.gsindex = GS_TLS_SEL;
			task->thread.gs = 0;
		} else {
			task->thread.gsindex = 0;
			task->thread.gs = addr;
			if (doit) {
				load_gs_index(0);
				ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_SET_FS:
		/* Not strictly needed for fs, but do it for symmetry
		   with gs */
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, FS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				loadsegment(fs, FS_TLS_SEL);
			}
			task->thread.fsindex = FS_TLS_SEL;
			task->thread.fs = 0;
		} else {
			task->thread.fsindex = 0;
			task->thread.fs = addr;
			if (doit) {
				/* set the selector to 0 to not confuse
				   __switch_to */
				loadsegment(fs, 0);
				ret = checking_wrmsrl(MSR_FS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_GET_FS: {
		/* @addr is a user pointer the base is written to. */
		unsigned long base;
		if (task->thread.fsindex == FS_TLS_SEL)
			base = read_32bit_tls(task, FS_TLS);
		else if (doit)
			rdmsrl(MSR_FS_BASE, base);
		else
			base = task->thread.fs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}
	case ARCH_GET_GS: {
		/* @addr is a user pointer the base is written to. */
		unsigned long base;
		unsigned gsindex;
		if (task->thread.gsindex == GS_TLS_SEL)
			base = read_32bit_tls(task, GS_TLS);
		else if (doit) {
			savesegment(gs, gsindex);
			if (gsindex)
				rdmsrl(MSR_KERNEL_GS_BASE, base);
			else
				base = task->thread.gs;
		} else
			base = task->thread.gs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}

	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}
636 
/* arch_prctl(2) syscall entry point: operate on the current task. */
long sys_arch_prctl(int code, unsigned long addr)
{
	return do_arch_prctl(current, code, addr);
}
641 
642 unsigned long KSTK_ESP(struct task_struct *task)
643 {
644 	return (test_tsk_thread_flag(task, TIF_IA32)) ?
645 			(task_pt_regs(task)->sp) : ((task)->thread.usersp);
646 }
647