xref: /linux/arch/x86/kernel/process_64.c (revision b43ab901d671e3e3cad425ea5e9a3c74e266dcdd)
/*
 *  Copyright (C) 1995  Linus Torvalds
 *
 *  Pentium III FXSR, SSE support
 *	Gareth Hughes <gareth@valinux.com>, May 2000
 *
 *  X86-64 port
 *	Andi Kleen.
 *
 *	CPU hotplug support - ashok.raj@intel.com
 */

/*
 * This file handles the architecture-dependent parts of process handling.
 */

#include <linux/stackprotector.h>
#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/interrupt.h>
#include <linux/delay.h>
#include <linux/module.h>
#include <linux/ptrace.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/tick.h>
#include <linux/prctl.h>
#include <linux/uaccess.h>
#include <linux/io.h>
#include <linux/ftrace.h>
#include <linux/cpuidle.h>

#include <asm/pgtable.h>
#include <asm/system.h>
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/mmu_context.h>
#include <asm/prctl.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/ia32.h>
#include <asm/idle.h>
#include <asm/syscalls.h>
#include <asm/debugreg.h>
#include <asm/nmi.h>

asmlinkage extern void ret_from_fork(void);

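/*
 * old_rsp caches the current task's user-space stack pointer while the
 * task runs in the kernel; __switch_to() below keeps it in sync with
 * thread.usersp.  is_idle flags whether this CPU is inside the idle
 * loop, so the idle notifier chain fires IDLE_START/IDLE_END in
 * matched pairs.
 */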
DEFINE_PER_CPU(unsigned long, old_rsp);
static DEFINE_PER_CPU(unsigned char, is_idle);

static ATOMIC_NOTIFIER_HEAD(idle_notifier);

void idle_notifier_register(struct notifier_block *n)
{
	atomic_notifier_chain_register(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_register);

void idle_notifier_unregister(struct notifier_block *n)
{
	atomic_notifier_chain_unregister(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_unregister);

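/*
 * Mark this CPU as idle and run the IDLE_START notifier chain.  Called
 * from the idle loop below with interrupts disabled, right before the
 * actual idle routine is entered.
 */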
void enter_idle(void)
{
	percpu_write(is_idle, 1);
	atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}

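/*
 * End the idle period and run the IDLE_END notifiers.  The
 * test-and-clear of is_idle ensures the notification fires at most once
 * per idle period, since both the interrupt path (via exit_idle()) and
 * the idle loop itself may call this.
 */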
static void __exit_idle(void)
{
	if (x86_test_and_clear_bit_percpu(0, is_idle) == 0)
		return;
	atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
}

/* Called from interrupts to signify idle end */
void exit_idle(void)
{
	/* idle loop has pid 0 */
	if (current->pid)
		return;
	__exit_idle();
}

#ifndef CONFIG_SMP
static inline void play_dead(void)
{
	BUG();
}
#endif

/*
 * The idle thread. There's no useful work to be
 * done, so just try to conserve power and have a
 * low exit latency (i.e. sit in a loop waiting for
 * somebody to say that they'd like to reschedule)
 */
void cpu_idle(void)
{
	current_thread_info()->status |= TS_POLLING;

	/*
	 * If we're the non-boot CPU, nothing set the stack canary up
	 * for us.  CPU0 already has it initialized but no harm in
	 * doing it again.  This is a good place for updating it, as
	 * we won't ever return from this function (so the invalid
	 * canaries already on the stack won't ever trigger).
	 */
	boot_init_stack_canary();

	/* endless idle loop with no priority at all */
	while (1) {
		tick_nohz_idle_enter();
		while (!need_resched()) {

			rmb();

			if (cpu_is_offline(smp_processor_id()))
				play_dead();
			/*
			 * Idle routines should keep interrupts disabled
			 * from here on, until they go to idle.
			 * Otherwise, idle callbacks can misfire.
			 */
			local_touch_nmi();
			local_irq_disable();
			enter_idle();
			/* Don't trace irqs off for idle */
			stop_critical_timings();

			/* enter_idle() needs rcu for notifiers */
			rcu_idle_enter();

			if (cpuidle_idle_call())
				pm_idle();

			rcu_idle_exit();
			start_critical_timings();

			/*
			 * In many cases the interrupt that ended idle
			 * has already called exit_idle.  But some idle
			 * loops can be woken up without interrupt.
			 */
			__exit_idle();
		}

		tick_nohz_idle_exit();
		preempt_enable_no_resched();
		schedule();
		preempt_disable();
	}
}

/* Prints also some state that isn't saved in the pt_regs */
void __show_regs(struct pt_regs *regs, int all)
{
	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
	unsigned long d0, d1, d2, d3, d6, d7;
	unsigned int fsindex, gsindex;
	unsigned int ds, cs, es;

	show_regs_common();
	printk(KERN_DEFAULT "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
	printk_address(regs->ip, 1);
	printk(KERN_DEFAULT "RSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss,
			regs->sp, regs->flags);
	printk(KERN_DEFAULT "RAX: %016lx RBX: %016lx RCX: %016lx\n",
	       regs->ax, regs->bx, regs->cx);
	printk(KERN_DEFAULT "RDX: %016lx RSI: %016lx RDI: %016lx\n",
	       regs->dx, regs->si, regs->di);
	printk(KERN_DEFAULT "RBP: %016lx R08: %016lx R09: %016lx\n",
	       regs->bp, regs->r8, regs->r9);
	printk(KERN_DEFAULT "R10: %016lx R11: %016lx R12: %016lx\n",
	       regs->r10, regs->r11, regs->r12);
	printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n",
	       regs->r13, regs->r14, regs->r15);

	asm("movl %%ds,%0" : "=r" (ds));
	asm("movl %%cs,%0" : "=r" (cs));
	asm("movl %%es,%0" : "=r" (es));
	asm("movl %%fs,%0" : "=r" (fsindex));
	asm("movl %%gs,%0" : "=r" (gsindex));

	rdmsrl(MSR_FS_BASE, fs);
	rdmsrl(MSR_GS_BASE, gs);
	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

	if (!all)
		return;

	cr0 = read_cr0();
	cr2 = read_cr2();
	cr3 = read_cr3();
	cr4 = read_cr4();

	printk(KERN_DEFAULT "FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
	       fs, fsindex, gs, gsindex, shadowgs);
	printk(KERN_DEFAULT "CS:  %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
			es, cr0);
	printk(KERN_DEFAULT "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
			cr4);

	get_debugreg(d0, 0);
	get_debugreg(d1, 1);
	get_debugreg(d2, 2);
	printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
	get_debugreg(d3, 3);
	get_debugreg(d6, 6);
	get_debugreg(d7, 7);
	printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
}

void release_thread(struct task_struct *dead_task)
{
	if (dead_task->mm) {
		if (dead_task->mm->context.size) {
			printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
					dead_task->comm,
					dead_task->mm->context.ldt,
					dead_task->mm->context.size);
			BUG();
		}
	}
}

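/*
 * Install a 32-bit TLS descriptor (4 GB limit, page granularity) in the
 * given TLS slot.  Used by do_arch_prctl() below to handle FS/GS bases
 * that fit in 32 bits via the GDT instead of the base MSRs.
 */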
static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
{
	struct user_desc ud = {
		.base_addr = addr,
		.limit = 0xfffff,
		.seg_32bit = 1,
		.limit_in_pages = 1,
		.useable = 1,
	};
	struct desc_struct *desc = t->thread.tls_array;
	desc += tls;
	fill_ldt(desc, &ud);
}

static inline u32 read_32bit_tls(struct task_struct *t, int tls)
{
	return get_desc_base(&t->thread.tls_array[tls]);
}

/*
 * This gets called before we allocate a new thread and copy
 * the current task into it.
 */
void prepare_to_copy(struct task_struct *tsk)
{
	unlazy_fpu(tsk);
}

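/*
 * Set up the kernel stack and thread state of a newly forked task: copy
 * the parent's pt_regs, clear RAX so the child sees a zero return
 * value, duplicate the segment and I/O-bitmap state, and honour
 * CLONE_SETTLS for both 64-bit and IA-32 children.
 */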
int copy_thread(unsigned long clone_flags, unsigned long sp,
		unsigned long unused,
	struct task_struct *p, struct pt_regs *regs)
{
	int err;
	struct pt_regs *childregs;
	struct task_struct *me = current;

	childregs = ((struct pt_regs *)
			(THREAD_SIZE + task_stack_page(p))) - 1;
	*childregs = *regs;

	childregs->ax = 0;
	if (user_mode(regs))
		childregs->sp = sp;
	else
		childregs->sp = (unsigned long)childregs;

	p->thread.sp = (unsigned long) childregs;
	p->thread.sp0 = (unsigned long) (childregs+1);
	p->thread.usersp = me->thread.usersp;

	set_tsk_thread_flag(p, TIF_FORK);

	p->thread.io_bitmap_ptr = NULL;

	savesegment(gs, p->thread.gsindex);
	p->thread.gs = p->thread.gsindex ? 0 : me->thread.gs;
	savesegment(fs, p->thread.fsindex);
	p->thread.fs = p->thread.fsindex ? 0 : me->thread.fs;
	savesegment(es, p->thread.es);
	savesegment(ds, p->thread.ds);

	err = -ENOMEM;
	memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));

	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
		p->thread.io_bitmap_ptr = kmemdup(me->thread.io_bitmap_ptr,
						  IO_BITMAP_BYTES, GFP_KERNEL);
		if (!p->thread.io_bitmap_ptr) {
			p->thread.io_bitmap_max = 0;
			return -ENOMEM;
		}
		set_tsk_thread_flag(p, TIF_IO_BITMAP);
	}

	/*
	 * Set a new TLS for the child thread?
	 */
	if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
		if (test_thread_flag(TIF_IA32))
			err = do_set_thread_area(p, -1,
				(struct user_desc __user *)childregs->si, 0);
		else
#endif
			err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
		if (err)
			goto out;
	}
	err = 0;
out:
	if (err && p->thread.io_bitmap_ptr) {
		kfree(p->thread.io_bitmap_ptr);
		p->thread.io_bitmap_max = 0;
	}

	return err;
}

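/*
 * Common exec-time setup: reset the data segments and FS/GS, point the
 * registers at the new entry point and stack, start with interrupts
 * enabled (X86_EFLAGS_IF), and drop any FPU/extended state inherited
 * across the exec.
 */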
static void
start_thread_common(struct pt_regs *regs, unsigned long new_ip,
		    unsigned long new_sp,
		    unsigned int _cs, unsigned int _ss, unsigned int _ds)
{
	loadsegment(fs, 0);
	loadsegment(es, _ds);
	loadsegment(ds, _ds);
	load_gs_index(0);
	regs->ip		= new_ip;
	regs->sp		= new_sp;
	percpu_write(old_rsp, new_sp);
	regs->cs		= _cs;
	regs->ss		= _ss;
	regs->flags		= X86_EFLAGS_IF;
	/*
	 * Free the old FP and other extended state
	 */
	free_thread_xstate(current);
}

void
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
	start_thread_common(regs, new_ip, new_sp,
			    __USER_CS, __USER_DS, 0);
}

#ifdef CONFIG_IA32_EMULATION
void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp)
{
	start_thread_common(regs, new_ip, new_sp,
			    __USER32_CS, __USER32_DS, __USER32_DS);
}
#endif

/*
 *	switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes are not supported here. Set the probe on schedule instead.
 * The function graph tracer is not supported here either.
 */
__notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
	struct thread_struct *prev = &prev_p->thread;
	struct thread_struct *next = &next_p->thread;
	int cpu = smp_processor_id();
	struct tss_struct *tss = &per_cpu(init_tss, cpu);
	unsigned fsindex, gsindex;
	bool preload_fpu;

	/*
	 * If the task has used fpu the last 5 timeslices, just do a full
	 * restore of the math state immediately to avoid the trap; the
	 * chances of needing FPU soon are obviously high now
	 */
	preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5;

	/* we're going to use this soon, after a few expensive things */
	if (preload_fpu)
		prefetch(next->fpu.state);

	/*
	 * Reload esp0, LDT and the page table pointer:
	 */
	load_sp0(tss, next);

	/*
	 * Switch DS and ES.
	 * This won't pick up thread selector changes, but I guess that is ok.
	 */
	savesegment(es, prev->es);
	if (unlikely(next->es | prev->es))
		loadsegment(es, next->es);

	savesegment(ds, prev->ds);
	if (unlikely(next->ds | prev->ds))
		loadsegment(ds, next->ds);


	/* We must save %fs and %gs before load_TLS() because
	 * %fs and %gs may be cleared by load_TLS().
	 *
	 * (e.g. xen_load_tls())
	 */
	savesegment(fs, fsindex);
	savesegment(gs, gsindex);

	load_TLS(next, cpu);

	/* Must be after DS reload */
	__unlazy_fpu(prev_p);

	/* Make sure cpu is ready for new context */
	if (preload_fpu)
		clts();

	/*
	 * Leave lazy mode, flushing any hypercalls made here.
	 * This must be done before restoring TLS segments so
	 * the GDT and LDT are properly updated, and must be
	 * done before math_state_restore, so the TS bit is up
	 * to date.
	 */
	arch_end_context_switch(next_p);

	/*
	 * Switch FS and GS.
	 *
	 * Segment register != 0 always requires a reload.  Also
	 * reload when it has changed.  When prev process used 64bit
	 * base always reload to avoid an information leak.
	 */
	if (unlikely(fsindex | next->fsindex | prev->fs)) {
		loadsegment(fs, next->fsindex);
		/*
		 * Check if the user used a selector != 0; if yes
		 *  clear 64bit base, since overloaded base is always
		 *  mapped to the Null selector
		 */
		if (fsindex)
			prev->fs = 0;
	}
	/* when next process has a 64bit base use it */
	if (next->fs)
		wrmsrl(MSR_FS_BASE, next->fs);
	prev->fsindex = fsindex;

	if (unlikely(gsindex | next->gsindex | prev->gs)) {
		load_gs_index(next->gsindex);
		if (gsindex)
			prev->gs = 0;
	}
	if (next->gs)
		wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
	prev->gsindex = gsindex;

	/*
	 * Switch the PDA and FPU contexts.
	 */
	prev->usersp = percpu_read(old_rsp);
	percpu_write(old_rsp, next->usersp);
	percpu_write(current_task, next_p);

	percpu_write(kernel_stack,
		  (unsigned long)task_stack_page(next_p) +
		  THREAD_SIZE - KERNEL_STACK_OFFSET);

	/*
	 * Now maybe reload the debug registers and handle I/O bitmaps
	 */
	if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
		     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
		__switch_to_xtra(prev_p, next_p, tss);

	/*
	 * Preload the FPU context, now that we've determined that the
	 * task is likely to be using it.
	 */
	if (preload_fpu)
		__math_state_restore();

	return prev_p;
}

void set_personality_64bit(void)
{
	/* inherit personality from parent */

	/* Make sure to be in 64bit mode */
	clear_thread_flag(TIF_IA32);

	/* Ensure the corresponding mm is not marked. */
	if (current->mm)
		current->mm->context.ia32_compat = 0;

	/* TBD: overwrites user setup. Should have two bits.
	   But 64bit processes have always behaved this way,
	   so it's not too bad. The main problem is just that
	   32bit children are affected again. */
	current->personality &= ~READ_IMPLIES_EXEC;
}

void set_personality_ia32(void)
{
	/* inherit personality from parent */

	/* Make sure to be in 32bit mode */
	set_thread_flag(TIF_IA32);
	current->personality |= force_personality32;

	/* Mark the associated mm as containing 32-bit tasks. */
	if (current->mm)
		current->mm->context.ia32_compat = 1;

	/* Prepare the first "return" to user space */
	current_thread_info()->status |= TS_COMPAT;
}

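/*
 * Best-effort "waiting channel": walk the sleeping task's frame
 * pointers from its saved kernel stack pointer and return the first
 * return address that is not inside the scheduler, bailing out if the
 * frame chain leaves the task's stack or runs suspiciously long.
 */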
unsigned long get_wchan(struct task_struct *p)
{
	unsigned long stack;
	u64 fp, ip;
	int count = 0;

	if (!p || p == current || p->state == TASK_RUNNING)
		return 0;
	stack = (unsigned long)task_stack_page(p);
	if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE)
		return 0;
	fp = *(u64 *)(p->thread.sp);
	do {
		if (fp < (unsigned long)stack ||
		    fp >= (unsigned long)stack+THREAD_SIZE)
			return 0;
		ip = *(u64 *)(fp+8);
		if (!in_sched_functions(ip))
			return ip;
		fp = *(u64 *)fp;
	} while (count++ < 16);
	return 0;
}

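/*
 * arch_prctl() backend.  ARCH_SET_FS/ARCH_SET_GS use a GDT TLS slot for
 * bases below 4 GB (cheaper to switch) and the FS_BASE/KERNEL_GS_BASE
 * MSRs for larger bases; ARCH_GET_FS/ARCH_GET_GS read the base back,
 * using the live MSR when the target task is current.
 */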
long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
	int ret = 0;
	int doit = task == current;
	int cpu;

	switch (code) {
	case ARCH_SET_GS:
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, GS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				load_gs_index(GS_TLS_SEL);
			}
			task->thread.gsindex = GS_TLS_SEL;
			task->thread.gs = 0;
		} else {
			task->thread.gsindex = 0;
			task->thread.gs = addr;
			if (doit) {
				load_gs_index(0);
				ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_SET_FS:
		/* Not strictly needed for fs, but do it for symmetry
		   with gs */
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, FS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				loadsegment(fs, FS_TLS_SEL);
			}
			task->thread.fsindex = FS_TLS_SEL;
			task->thread.fs = 0;
		} else {
			task->thread.fsindex = 0;
			task->thread.fs = addr;
			if (doit) {
				/* set the selector to 0 to not confuse
				   __switch_to */
				loadsegment(fs, 0);
				ret = checking_wrmsrl(MSR_FS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_GET_FS: {
		unsigned long base;
		if (task->thread.fsindex == FS_TLS_SEL)
			base = read_32bit_tls(task, FS_TLS);
		else if (doit)
			rdmsrl(MSR_FS_BASE, base);
		else
			base = task->thread.fs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}
	case ARCH_GET_GS: {
		unsigned long base;
		unsigned gsindex;
		if (task->thread.gsindex == GS_TLS_SEL)
			base = read_32bit_tls(task, GS_TLS);
		else if (doit) {
			savesegment(gs, gsindex);
			if (gsindex)
				rdmsrl(MSR_KERNEL_GS_BASE, base);
			else
				base = task->thread.gs;
		} else
			base = task->thread.gs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}

	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}

long sys_arch_prctl(int code, unsigned long addr)
{
	return do_arch_prctl(current, code, addr);
}

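/*
 * User stack pointer of a task: IA-32 tasks keep it in pt_regs, 64-bit
 * tasks keep it in thread.usersp, which __switch_to() keeps in sync
 * with the per-CPU old_rsp.
 */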
unsigned long KSTK_ESP(struct task_struct *task)
{
	return (test_tsk_thread_flag(task, TIF_IA32)) ?
			(task_pt_regs(task)->sp) : ((task)->thread.usersp);
}