xref: /linux/arch/x86/kernel/process_64.c (revision 9ffc93f203c18a70623f21950f1dd473c9ec48cd)
/*
 *  Copyright (C) 1995  Linus Torvalds
 *
 *  Pentium III FXSR, SSE support
 *	Gareth Hughes <gareth@valinux.com>, May 2000
 *
 *  X86-64 port
 *	Andi Kleen.
 *
 *	CPU hotplug support - ashok.raj@intel.com
 */

/*
 * This file handles the architecture-dependent parts of process handling.
 */

#include <linux/stackprotector.h>
#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/interrupt.h>
#include <linux/delay.h>
#include <linux/module.h>
#include <linux/ptrace.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/tick.h>
#include <linux/prctl.h>
#include <linux/uaccess.h>
#include <linux/io.h>
#include <linux/ftrace.h>
#include <linux/cpuidle.h>

#include <asm/pgtable.h>
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/fpu-internal.h>
#include <asm/mmu_context.h>
#include <asm/prctl.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/ia32.h>
#include <asm/idle.h>
#include <asm/syscalls.h>
#include <asm/debugreg.h>
#include <asm/nmi.h>
#include <asm/switch_to.h>

asmlinkage extern void ret_from_fork(void);

DEFINE_PER_CPU(unsigned long, old_rsp);
static DEFINE_PER_CPU(unsigned char, is_idle);

static ATOMIC_NOTIFIER_HEAD(idle_notifier);

void idle_notifier_register(struct notifier_block *n)
{
	atomic_notifier_chain_register(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_register);

void idle_notifier_unregister(struct notifier_block *n)
{
	atomic_notifier_chain_unregister(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_unregister);

void enter_idle(void)
{
	percpu_write(is_idle, 1);
	atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}

static void __exit_idle(void)
{
	if (x86_test_and_clear_bit_percpu(0, is_idle) == 0)
		return;
	atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
}

/* Called from interrupts to signify idle end */
void exit_idle(void)
{
	/* idle loop has pid 0 */
	if (current->pid)
		return;
	__exit_idle();
}

#ifndef CONFIG_SMP
static inline void play_dead(void)
{
	BUG();
}
#endif

/*
 * The idle thread. There's no useful work to be
 * done, so just try to conserve power and have a
 * low exit latency (i.e. sit in a loop waiting for
 * somebody to say that they'd like to reschedule)
 */
void cpu_idle(void)
{
	current_thread_info()->status |= TS_POLLING;

	/*
	 * If we're the non-boot CPU, nothing set the stack canary up
	 * for us.  CPU0 already has it initialized but no harm in
	 * doing it again.  This is a good place for updating it, as
	 * we won't ever return from this function (so the invalid
	 * canaries already on the stack won't ever trigger).
	 */
	boot_init_stack_canary();

	/* endless idle loop with no priority at all */
	while (1) {
		tick_nohz_idle_enter();
		while (!need_resched()) {

			rmb();

			if (cpu_is_offline(smp_processor_id()))
				play_dead();
			/*
			 * Idle routines should keep interrupts disabled
			 * from here on, until they go to idle.
			 * Otherwise, idle callbacks can misfire.
			 */
			local_touch_nmi();
			local_irq_disable();
			enter_idle();
			/* Don't trace irqs off for idle */
			stop_critical_timings();

			/* enter_idle() needs rcu for notifiers */
			rcu_idle_enter();

			if (cpuidle_idle_call())
				pm_idle();

			rcu_idle_exit();
			start_critical_timings();

			/* In many cases the interrupt that ended idle
			   has already called exit_idle. But some idle
			   loops can be woken up without interrupt. */
			__exit_idle();
		}

		tick_nohz_idle_exit();
		schedule_preempt_disabled();
	}
}
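
/*
 * A brief descriptive note on the loop above (a sketch, not from the
 * original file): TS_POLLING advertises that this idle task polls
 * need_resched(), so a remote wakeup can simply set TIF_NEED_RESCHED
 * instead of sending a reschedule IPI.  When cpuidle_idle_call() returns
 * nonzero (no cpuidle driver handled the idle entry), the loop falls back
 * to pm_idle(), the architecture's default idle routine (typically hlt-
 * or mwait-based).
 */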

/* Also prints some state that isn't saved in the pt_regs */
void __show_regs(struct pt_regs *regs, int all)
{
	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
	unsigned long d0, d1, d2, d3, d6, d7;
	unsigned int fsindex, gsindex;
	unsigned int ds, cs, es;

	show_regs_common();
	printk(KERN_DEFAULT "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
	printk_address(regs->ip, 1);
	printk(KERN_DEFAULT "RSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss,
			regs->sp, regs->flags);
	printk(KERN_DEFAULT "RAX: %016lx RBX: %016lx RCX: %016lx\n",
	       regs->ax, regs->bx, regs->cx);
	printk(KERN_DEFAULT "RDX: %016lx RSI: %016lx RDI: %016lx\n",
	       regs->dx, regs->si, regs->di);
	printk(KERN_DEFAULT "RBP: %016lx R08: %016lx R09: %016lx\n",
	       regs->bp, regs->r8, regs->r9);
	printk(KERN_DEFAULT "R10: %016lx R11: %016lx R12: %016lx\n",
	       regs->r10, regs->r11, regs->r12);
	printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n",
	       regs->r13, regs->r14, regs->r15);

	asm("movl %%ds,%0" : "=r" (ds));
	asm("movl %%cs,%0" : "=r" (cs));
	asm("movl %%es,%0" : "=r" (es));
	asm("movl %%fs,%0" : "=r" (fsindex));
	asm("movl %%gs,%0" : "=r" (gsindex));

	rdmsrl(MSR_FS_BASE, fs);
	rdmsrl(MSR_GS_BASE, gs);
	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

	if (!all)
		return;

	cr0 = read_cr0();
	cr2 = read_cr2();
	cr3 = read_cr3();
	cr4 = read_cr4();

	printk(KERN_DEFAULT "FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
	       fs, fsindex, gs, gsindex, shadowgs);
	printk(KERN_DEFAULT "CS:  %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
			es, cr0);
	printk(KERN_DEFAULT "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
			cr4);

	get_debugreg(d0, 0);
	get_debugreg(d1, 1);
	get_debugreg(d2, 2);
	printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
	get_debugreg(d3, 3);
	get_debugreg(d6, 6);
	get_debugreg(d7, 7);
	printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
}

void release_thread(struct task_struct *dead_task)
{
	if (dead_task->mm) {
		if (dead_task->mm->context.size) {
			printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
					dead_task->comm,
					dead_task->mm->context.ldt,
					dead_task->mm->context.size);
			BUG();
		}
	}
}

static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
{
	struct user_desc ud = {
		.base_addr = addr,
		.limit = 0xfffff,
		.seg_32bit = 1,
		.limit_in_pages = 1,
		.useable = 1,
	};
	struct desc_struct *desc = t->thread.tls_array;
	desc += tls;
	fill_ldt(desc, &ud);
}

static inline u32 read_32bit_tls(struct task_struct *t, int tls)
{
	return get_desc_base(&t->thread.tls_array[tls]);
}
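
/*
 * Rough sketch of what set_32bit_tls() builds (descriptive only): a flat
 * 32-bit data descriptor with base 'addr' and a 4 GiB limit (0xfffff pages),
 * stored in the thread's tls_array slot 'tls'.  load_TLS() copies that array
 * into the per-CPU GDT on every context switch, so FS_TLS_SEL/GS_TLS_SEL can
 * refer to it from user space.
 */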

/*
 * This gets called before we allocate a new thread and copy
 * the current task into it.
 */
void prepare_to_copy(struct task_struct *tsk)
{
	unlazy_fpu(tsk);
}

int copy_thread(unsigned long clone_flags, unsigned long sp,
		unsigned long unused,
	struct task_struct *p, struct pt_regs *regs)
{
	int err;
	struct pt_regs *childregs;
	struct task_struct *me = current;

	childregs = ((struct pt_regs *)
			(THREAD_SIZE + task_stack_page(p))) - 1;
	*childregs = *regs;

	childregs->ax = 0;
	if (user_mode(regs))
		childregs->sp = sp;
	else
		childregs->sp = (unsigned long)childregs;

	p->thread.sp = (unsigned long) childregs;
	p->thread.sp0 = (unsigned long) (childregs+1);
	p->thread.usersp = me->thread.usersp;

	set_tsk_thread_flag(p, TIF_FORK);

	p->fpu_counter = 0;
	p->thread.io_bitmap_ptr = NULL;

	savesegment(gs, p->thread.gsindex);
	p->thread.gs = p->thread.gsindex ? 0 : me->thread.gs;
	savesegment(fs, p->thread.fsindex);
	p->thread.fs = p->thread.fsindex ? 0 : me->thread.fs;
	savesegment(es, p->thread.es);
	savesegment(ds, p->thread.ds);

	err = -ENOMEM;
	memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));

	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
		p->thread.io_bitmap_ptr = kmemdup(me->thread.io_bitmap_ptr,
						  IO_BITMAP_BYTES, GFP_KERNEL);
		if (!p->thread.io_bitmap_ptr) {
			p->thread.io_bitmap_max = 0;
			return -ENOMEM;
		}
		set_tsk_thread_flag(p, TIF_IO_BITMAP);
	}

	/*
	 * Set a new TLS for the child thread?
	 */
	if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
		if (test_thread_flag(TIF_IA32))
			err = do_set_thread_area(p, -1,
				(struct user_desc __user *)childregs->si, 0);
		else
#endif
			err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
		if (err)
			goto out;
	}
	err = 0;
out:
	if (err && p->thread.io_bitmap_ptr) {
		kfree(p->thread.io_bitmap_ptr);
		p->thread.io_bitmap_max = 0;
	}

	return err;
}
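
/*
 * Rough picture of what copy_thread() sets up (a descriptive sketch, not
 * verbatim from the source): the child's pt_regs frame sits at the very top
 * of its kernel stack, so thread.sp0 points just past it and thread.sp points
 * at it.  ret_from_fork (with TIF_FORK set) unwinds into that frame, and
 * because childregs->ax was zeroed the child sees fork()/clone() return 0.
 * For CLONE_SETTLS, the new TLS argument arrives in r8 for 64-bit callers and
 * in si (as a struct user_desc pointer) for ia32 callers.
 */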

static void
start_thread_common(struct pt_regs *regs, unsigned long new_ip,
		    unsigned long new_sp,
		    unsigned int _cs, unsigned int _ss, unsigned int _ds)
{
	loadsegment(fs, 0);
	loadsegment(es, _ds);
	loadsegment(ds, _ds);
	load_gs_index(0);
	current->thread.usersp	= new_sp;
	regs->ip		= new_ip;
	regs->sp		= new_sp;
	percpu_write(old_rsp, new_sp);
	regs->cs		= _cs;
	regs->ss		= _ss;
	regs->flags		= X86_EFLAGS_IF;
	/*
	 * Free the old FP and other extended state
	 */
	free_thread_xstate(current);
}

void
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
	start_thread_common(regs, new_ip, new_sp,
			    __USER_CS, __USER_DS, 0);
}

#ifdef CONFIG_IA32_EMULATION
void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp)
{
	start_thread_common(regs, new_ip, new_sp,
			    __USER32_CS, __USER32_DS, __USER32_DS);
}
#endif

/*
 *	switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes not supported here. Set the probe on schedule instead.
 * The function graph tracer is not supported here either.
 */
__notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
	struct thread_struct *prev = &prev_p->thread;
	struct thread_struct *next = &next_p->thread;
	int cpu = smp_processor_id();
	struct tss_struct *tss = &per_cpu(init_tss, cpu);
	unsigned fsindex, gsindex;
	fpu_switch_t fpu;

	fpu = switch_fpu_prepare(prev_p, next_p, cpu);

	/*
	 * Reload esp0, LDT and the page table pointer:
	 */
	load_sp0(tss, next);

	/*
	 * Switch DS and ES.
	 * This won't pick up thread selector changes, but I guess that is ok.
	 */
	savesegment(es, prev->es);
	if (unlikely(next->es | prev->es))
		loadsegment(es, next->es);

	savesegment(ds, prev->ds);
	if (unlikely(next->ds | prev->ds))
		loadsegment(ds, next->ds);


	/* We must save %fs and %gs before load_TLS() because
	 * %fs and %gs may be cleared by load_TLS().
	 *
	 * (e.g. xen_load_tls())
	 */
	savesegment(fs, fsindex);
	savesegment(gs, gsindex);

	load_TLS(next, cpu);

	/*
	 * Leave lazy mode, flushing any hypercalls made here.
	 * This must be done before restoring TLS segments so
	 * the GDT and LDT are properly updated, and must be
	 * done before math_state_restore, so the TS bit is up
	 * to date.
	 */
	arch_end_context_switch(next_p);

	/*
	 * Switch FS and GS.
	 *
	 * A segment register != 0 always requires a reload.  Also
	 * reload when it has changed.  When the previous task used a
	 * 64-bit base, always reload to avoid an information leak.
	 */
	if (unlikely(fsindex | next->fsindex | prev->fs)) {
		loadsegment(fs, next->fsindex);
		/*
		 * Check if the user used a selector != 0; if yes,
		 * clear the 64-bit base, since the overloaded base
		 * is always mapped to the NULL selector.
		 */
		if (fsindex)
			prev->fs = 0;
	}
	/* when the next task has a 64-bit base, use it */
	if (next->fs)
		wrmsrl(MSR_FS_BASE, next->fs);
	prev->fsindex = fsindex;

	if (unlikely(gsindex | next->gsindex | prev->gs)) {
		load_gs_index(next->gsindex);
		if (gsindex)
			prev->gs = 0;
	}
	if (next->gs)
		wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
	prev->gsindex = gsindex;

	switch_fpu_finish(next_p, fpu);

	/*
	 * Switch the PDA and FPU contexts.
	 */
	prev->usersp = percpu_read(old_rsp);
	percpu_write(old_rsp, next->usersp);
	percpu_write(current_task, next_p);

	percpu_write(kernel_stack,
		  (unsigned long)task_stack_page(next_p) +
		  THREAD_SIZE - KERNEL_STACK_OFFSET);

	/*
	 * Now maybe reload the debug registers and handle I/O bitmaps
	 */
	if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
		     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
		__switch_to_xtra(prev_p, next_p, tss);

	return prev_p;
}
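
/*
 * A rough summary of the fs/gs convention used above (descriptive sketch,
 * not normative): a nonzero selector means the base comes from a GDT/LDT
 * entry reloaded via load_TLS(), and any stale 64-bit base is dropped; a
 * zero selector with a nonzero thread.fs/thread.gs means the base comes
 * from wrmsrl(MSR_FS_BASE) or wrmsrl(MSR_KERNEL_GS_BASE).  GS uses the
 * "kernel" MSR here because, after swapgs on kernel entry, that MSR holds
 * the base the user will see once we return to user space.
 */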

void set_personality_64bit(void)
{
	/* inherit personality from parent */

	/* Make sure to be in 64bit mode */
	clear_thread_flag(TIF_IA32);

	/* Ensure the corresponding mm is not marked. */
	if (current->mm)
		current->mm->context.ia32_compat = 0;

	/* TBD: overwrites user setup. Should have two bits.
	   But 64-bit processes have always behaved this way,
	   so it's not too bad. The main problem is just that
	   32-bit children are affected again. */
	current->personality &= ~READ_IMPLIES_EXEC;
}

void set_personality_ia32(void)
{
	/* inherit personality from parent */

	/* Make sure to be in 32bit mode */
	set_thread_flag(TIF_IA32);
	current->personality |= force_personality32;

	/* Mark the associated mm as containing 32-bit tasks. */
	if (current->mm)
		current->mm->context.ia32_compat = 1;

	/* Prepare the first "return" to user space */
	current_thread_info()->status |= TS_COMPAT;
}

unsigned long get_wchan(struct task_struct *p)
{
	unsigned long stack;
	u64 fp, ip;
	int count = 0;

	if (!p || p == current || p->state == TASK_RUNNING)
		return 0;
	stack = (unsigned long)task_stack_page(p);
	if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE)
		return 0;
	fp = *(u64 *)(p->thread.sp);
	do {
		if (fp < (unsigned long)stack ||
		    fp >= (unsigned long)stack+THREAD_SIZE)
			return 0;
		ip = *(u64 *)(fp+8);
		if (!in_sched_functions(ip))
			return ip;
		fp = *(u64 *)fp;
	} while (count++ < 16);
	return 0;
}
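
/*
 * get_wchan() above relies on the standard x86-64 frame-pointer layout
 * (a descriptive note, assuming CONFIG_FRAME_POINTER-style frames): at each
 * frame, *fp is the caller's saved frame pointer and *(fp + 8) is the return
 * address.  The walk starts from the sleeping task's saved thread.sp and
 * stops at the first return address outside the scheduler, which is reported
 * as the "wait channel" (e.g. in /proc/<pid>/wchan).
 */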

long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
	int ret = 0;
	int doit = task == current;
	int cpu;

	switch (code) {
	case ARCH_SET_GS:
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, GS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				load_gs_index(GS_TLS_SEL);
			}
			task->thread.gsindex = GS_TLS_SEL;
			task->thread.gs = 0;
		} else {
			task->thread.gsindex = 0;
			task->thread.gs = addr;
			if (doit) {
				load_gs_index(0);
				ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_SET_FS:
		/* Not strictly needed for fs, but do it for symmetry
		   with gs */
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, FS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				loadsegment(fs, FS_TLS_SEL);
			}
			task->thread.fsindex = FS_TLS_SEL;
			task->thread.fs = 0;
		} else {
			task->thread.fsindex = 0;
			task->thread.fs = addr;
			if (doit) {
				/* set the selector to 0 to not confuse
				   __switch_to */
				loadsegment(fs, 0);
				ret = checking_wrmsrl(MSR_FS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_GET_FS: {
		unsigned long base;
		if (task->thread.fsindex == FS_TLS_SEL)
			base = read_32bit_tls(task, FS_TLS);
		else if (doit)
			rdmsrl(MSR_FS_BASE, base);
		else
			base = task->thread.fs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}
	case ARCH_GET_GS: {
		unsigned long base;
		unsigned gsindex;
		if (task->thread.gsindex == GS_TLS_SEL)
			base = read_32bit_tls(task, GS_TLS);
		else if (doit) {
			savesegment(gs, gsindex);
			if (gsindex)
				rdmsrl(MSR_KERNEL_GS_BASE, base);
			else
				base = task->thread.gs;
		} else
			base = task->thread.gs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}

	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}
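
/*
 * Why the "small bases via the GDT" split above (a descriptive sketch): a
 * base below 4 GiB fits in a descriptor's 32-bit base field, so it can live
 * in one of the thread's TLS GDT slots and is restored implicitly by
 * load_TLS() on every context switch.  Larger bases cannot be expressed in
 * a descriptor at all and must be written with wrmsrl(MSR_FS_BASE) or
 * wrmsrl(MSR_KERNEL_GS_BASE), which __switch_to only does when needed.
 */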

long sys_arch_prctl(int code, unsigned long addr)
{
	return do_arch_prctl(current, code, addr);
}
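
/*
 * Illustrative user-space usage (a hedged sketch, not part of this file):
 * a 64-bit process typically reaches this through the arch_prctl(2)
 * syscall, roughly as follows, with 'tls_block' standing in for whatever
 * area the caller wants FS to point at:
 *
 *	#include <asm/prctl.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	unsigned long base;
 *	syscall(SYS_arch_prctl, ARCH_SET_FS, (unsigned long)tls_block);
 *	syscall(SYS_arch_prctl, ARCH_GET_FS, (unsigned long)&base);
 *
 * Note the asymmetry in 'addr': for ARCH_SET_* it is the new base itself,
 * while for ARCH_GET_* it is a user pointer that receives the base.
 */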

unsigned long KSTK_ESP(struct task_struct *task)
{
	return (test_tsk_thread_flag(task, TIF_IA32)) ?
			(task_pt_regs(task)->sp) : ((task)->thread.usersp);
}
648