xref: /linux/arch/x86/kernel/process_64.c (revision bd2f55361f18347e890d52ff9cfd8895455ec11b)
/*
 *  Copyright (C) 1995  Linus Torvalds
 *
 *  Pentium III FXSR, SSE support
 *	Gareth Hughes <gareth@valinux.com>, May 2000
 *
 *  X86-64 port
 *	Andi Kleen.
 *
 *	CPU hotplug support - ashok.raj@intel.com
 */

/*
 * This file handles the architecture-dependent parts of process handling.
 */

#include <linux/stackprotector.h>
#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/interrupt.h>
#include <linux/delay.h>
#include <linux/module.h>
#include <linux/ptrace.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/tick.h>
#include <linux/prctl.h>
#include <linux/uaccess.h>
#include <linux/io.h>
#include <linux/ftrace.h>
#include <linux/cpuidle.h>

#include <asm/pgtable.h>
#include <asm/system.h>
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/mmu_context.h>
#include <asm/prctl.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/ia32.h>
#include <asm/idle.h>
#include <asm/syscalls.h>
#include <asm/debugreg.h>
#include <asm/nmi.h>

asmlinkage extern void ret_from_fork(void);

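/*
 * old_rsp is the per-CPU slot where the 64-bit syscall entry path parks
 * the user-space stack pointer; __switch_to() keeps it in sync with the
 * outgoing/incoming task's thread.usersp.  is_idle flags whether this
 * CPU is currently in the idle loop, for exit_idle().
 */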
DEFINE_PER_CPU(unsigned long, old_rsp);
static DEFINE_PER_CPU(unsigned char, is_idle);

static ATOMIC_NOTIFIER_HEAD(idle_notifier);

void idle_notifier_register(struct notifier_block *n)
{
	atomic_notifier_chain_register(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_register);

void idle_notifier_unregister(struct notifier_block *n)
{
	atomic_notifier_chain_unregister(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_unregister);

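/*
 * Mark this CPU as idle and let the idle notifier chain know (IDLE_START).
 * Called from the idle loop with interrupts disabled.
 */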
void enter_idle(void)
{
	percpu_write(is_idle, 1);
	atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}

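/*
 * Clear the per-CPU is_idle flag and fire the IDLE_END notifiers, but
 * only if the flag was actually set, so the notifiers see at most one
 * IDLE_END per idle period.
 */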
static void __exit_idle(void)
{
	if (x86_test_and_clear_bit_percpu(0, is_idle) == 0)
		return;
	atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
}

/* Called from interrupts to signify idle end */
void exit_idle(void)
{
	/* idle loop has pid 0 */
	if (current->pid)
		return;
	__exit_idle();
}

#ifndef CONFIG_SMP
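/*
 * On UP kernels a CPU can never be taken offline, so the idle loop
 * should never get here.
 */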
static inline void play_dead(void)
{
	BUG();
}
#endif

/*
 * The idle thread. There's no useful work to be
 * done, so just try to conserve power and have a
 * low exit latency (i.e. sit in a loop waiting for
 * somebody to say that they'd like to reschedule).
 */
void cpu_idle(void)
{
	current_thread_info()->status |= TS_POLLING;

	/*
	 * If we're a non-boot CPU, nothing set the stack canary up
	 * for us.  CPU0 already has it initialized but no harm in
	 * doing it again.  This is a good place for updating it, as
	 * we won't ever return from this function (so the invalid
	 * canaries already on the stack won't ever trigger).
	 */
	boot_init_stack_canary();

	/* endless idle loop with no priority at all */
	while (1) {
		tick_nohz_idle_enter();
		while (!need_resched()) {

			rmb();

			if (cpu_is_offline(smp_processor_id()))
				play_dead();
			/*
			 * Idle routines should keep interrupts disabled
			 * from here on, until they go to idle.
			 * Otherwise, idle callbacks can misfire.
			 */
			local_touch_nmi();
			local_irq_disable();
			enter_idle();
			/* Don't trace irqs off for idle */
			stop_critical_timings();

			/* enter_idle() needs rcu for notifiers */
			rcu_idle_enter();

			if (cpuidle_idle_call())
				pm_idle();

			rcu_idle_exit();
			start_critical_timings();

			/*
			 * In many cases the interrupt that ended idle
			 * has already called exit_idle. But some idle
			 * loops can be woken up without interrupt.
			 */
			__exit_idle();
		}

		tick_nohz_idle_exit();
		schedule_preempt_disabled();
	}
}

/* Also prints some state that isn't saved in the pt_regs */
void __show_regs(struct pt_regs *regs, int all)
{
	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
	unsigned long d0, d1, d2, d3, d6, d7;
	unsigned int fsindex, gsindex;
	unsigned int ds, cs, es;

	show_regs_common();
	printk(KERN_DEFAULT "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
	printk_address(regs->ip, 1);
	printk(KERN_DEFAULT "RSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss,
			regs->sp, regs->flags);
	printk(KERN_DEFAULT "RAX: %016lx RBX: %016lx RCX: %016lx\n",
	       regs->ax, regs->bx, regs->cx);
	printk(KERN_DEFAULT "RDX: %016lx RSI: %016lx RDI: %016lx\n",
	       regs->dx, regs->si, regs->di);
	printk(KERN_DEFAULT "RBP: %016lx R08: %016lx R09: %016lx\n",
	       regs->bp, regs->r8, regs->r9);
	printk(KERN_DEFAULT "R10: %016lx R11: %016lx R12: %016lx\n",
	       regs->r10, regs->r11, regs->r12);
	printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n",
	       regs->r13, regs->r14, regs->r15);

	asm("movl %%ds,%0" : "=r" (ds));
	asm("movl %%cs,%0" : "=r" (cs));
	asm("movl %%es,%0" : "=r" (es));
	asm("movl %%fs,%0" : "=r" (fsindex));
	asm("movl %%gs,%0" : "=r" (gsindex));

	rdmsrl(MSR_FS_BASE, fs);
	rdmsrl(MSR_GS_BASE, gs);
	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

	if (!all)
		return;

	cr0 = read_cr0();
	cr2 = read_cr2();
	cr3 = read_cr3();
	cr4 = read_cr4();

	printk(KERN_DEFAULT "FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
	       fs, fsindex, gs, gsindex, shadowgs);
	printk(KERN_DEFAULT "CS:  %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
			es, cr0);
	printk(KERN_DEFAULT "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
			cr4);

	get_debugreg(d0, 0);
	get_debugreg(d1, 1);
	get_debugreg(d2, 2);
	printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
	get_debugreg(d3, 3);
	get_debugreg(d6, 6);
	get_debugreg(d7, 7);
	printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
}

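/*
 * Called when a dead task's thread resources are released.  By this
 * point its LDT should already be gone, so warn and BUG if the mm
 * still has one.
 */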
void release_thread(struct task_struct *dead_task)
{
	if (dead_task->mm) {
		if (dead_task->mm->context.size) {
			printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
					dead_task->comm,
					dead_task->mm->context.ldt,
					dead_task->mm->context.size);
			BUG();
		}
	}
}

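/*
 * Helpers for stashing a 32-bit FS/GS base in the task's GDT TLS slots
 * (see do_arch_prctl()): set_32bit_tls() installs a descriptor with the
 * given base, read_32bit_tls() reads the base back.
 */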
static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
{
	struct user_desc ud = {
		.base_addr = addr,
		.limit = 0xfffff,
		.seg_32bit = 1,
		.limit_in_pages = 1,
		.useable = 1,
	};
	struct desc_struct *desc = t->thread.tls_array;
	desc += tls;
	fill_ldt(desc, &ud);
}

static inline u32 read_32bit_tls(struct task_struct *t, int tls)
{
	return get_desc_base(&t->thread.tls_array[tls]);
}

/*
 * This gets called before we allocate a new thread and copy
 * the current task into it.
 */
void prepare_to_copy(struct task_struct *tsk)
{
	unlazy_fpu(tsk);
}

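/*
 * Set up the new task's thread state at fork/clone time: copy the
 * parent's register frame to the top of the child's kernel stack (with
 * ax zeroed so the child sees a 0 return value), record the stack
 * pointers and segment state in thread_struct, duplicate the I/O
 * permission bitmap if the parent has one, and honour CLONE_SETTLS.
 */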
int copy_thread(unsigned long clone_flags, unsigned long sp,
		unsigned long unused,
	struct task_struct *p, struct pt_regs *regs)
{
	int err;
	struct pt_regs *childregs;
	struct task_struct *me = current;

	childregs = ((struct pt_regs *)
			(THREAD_SIZE + task_stack_page(p))) - 1;
	*childregs = *regs;

	childregs->ax = 0;
	if (user_mode(regs))
		childregs->sp = sp;
	else
		childregs->sp = (unsigned long)childregs;

	p->thread.sp = (unsigned long) childregs;
	p->thread.sp0 = (unsigned long) (childregs+1);
	p->thread.usersp = me->thread.usersp;

	set_tsk_thread_flag(p, TIF_FORK);

	p->fpu_counter = 0;
	p->thread.io_bitmap_ptr = NULL;

	savesegment(gs, p->thread.gsindex);
	p->thread.gs = p->thread.gsindex ? 0 : me->thread.gs;
	savesegment(fs, p->thread.fsindex);
	p->thread.fs = p->thread.fsindex ? 0 : me->thread.fs;
	savesegment(es, p->thread.es);
	savesegment(ds, p->thread.ds);

	err = -ENOMEM;
	memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));

	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
		p->thread.io_bitmap_ptr = kmemdup(me->thread.io_bitmap_ptr,
						  IO_BITMAP_BYTES, GFP_KERNEL);
		if (!p->thread.io_bitmap_ptr) {
			p->thread.io_bitmap_max = 0;
			return -ENOMEM;
		}
		set_tsk_thread_flag(p, TIF_IO_BITMAP);
	}

	/*
	 * Set a new TLS for the child thread?
	 */
	if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
		if (test_thread_flag(TIF_IA32))
			err = do_set_thread_area(p, -1,
				(struct user_desc __user *)childregs->si, 0);
		else
#endif
			err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
		if (err)
			goto out;
	}
	err = 0;
out:
	if (err && p->thread.io_bitmap_ptr) {
		kfree(p->thread.io_bitmap_ptr);
		p->thread.io_bitmap_max = 0;
	}

	return err;
}

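/*
 * Common part of exec-time register setup: reset DS and ES to the given
 * data segment, clear the FS selector and GS index, point the register
 * image at the new entry point and stack, record the new user stack in
 * old_rsp, and free any extended FPU state inherited across the exec.
 */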
static void
start_thread_common(struct pt_regs *regs, unsigned long new_ip,
		    unsigned long new_sp,
		    unsigned int _cs, unsigned int _ss, unsigned int _ds)
{
	loadsegment(fs, 0);
	loadsegment(es, _ds);
	loadsegment(ds, _ds);
	load_gs_index(0);
	regs->ip		= new_ip;
	regs->sp		= new_sp;
	percpu_write(old_rsp, new_sp);
	regs->cs		= _cs;
	regs->ss		= _ss;
	regs->flags		= X86_EFLAGS_IF;
	/*
	 * Free the old FP and other extended state
	 */
	free_thread_xstate(current);
}

void
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
	start_thread_common(regs, new_ip, new_sp,
			    __USER_CS, __USER_DS, 0);
}

#ifdef CONFIG_IA32_EMULATION
void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp)
{
	start_thread_common(regs, new_ip, new_sp,
			    __USER32_CS, __USER32_DS, __USER32_DS);
}
#endif

/*
 *	switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes not supported here. Set the probe on schedule instead.
 * Function graph tracer is not supported here either.
 */
__notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
	struct thread_struct *prev = &prev_p->thread;
	struct thread_struct *next = &next_p->thread;
	int cpu = smp_processor_id();
	struct tss_struct *tss = &per_cpu(init_tss, cpu);
	unsigned fsindex, gsindex;
	fpu_switch_t fpu;

	fpu = switch_fpu_prepare(prev_p, next_p, cpu);

	/*
	 * Reload esp0, LDT and the page table pointer:
	 */
	load_sp0(tss, next);

	/*
	 * Switch DS and ES.
	 * This won't pick up thread selector changes, but I guess that is ok.
	 */
	savesegment(es, prev->es);
	if (unlikely(next->es | prev->es))
		loadsegment(es, next->es);

	savesegment(ds, prev->ds);
	if (unlikely(next->ds | prev->ds))
		loadsegment(ds, next->ds);


	/* We must save %fs and %gs before load_TLS() because
	 * %fs and %gs may be cleared by load_TLS().
	 *
	 * (e.g. xen_load_tls())
	 */
	savesegment(fs, fsindex);
	savesegment(gs, gsindex);

	load_TLS(next, cpu);

	/*
	 * Leave lazy mode, flushing any hypercalls made here.
	 * This must be done before restoring TLS segments so
	 * the GDT and LDT are properly updated, and must be
	 * done before math_state_restore, so the TS bit is up
	 * to date.
	 */
	arch_end_context_switch(next_p);

	/*
	 * Switch FS and GS.
	 *
	 * A non-zero segment selector always requires a reload.  Also
	 * reload when it has changed.  When the previous process used a
	 * 64-bit base, always reload to avoid an information leak.
	 */
	if (unlikely(fsindex | next->fsindex | prev->fs)) {
		loadsegment(fs, next->fsindex);
		/*
		 * If the user loaded a non-zero selector, clear the
		 * saved 64-bit base, since the base is only valid
		 * together with the null selector.
		 */
		if (fsindex)
			prev->fs = 0;
	}
	/* when next process has a 64bit base use it */
	if (next->fs)
		wrmsrl(MSR_FS_BASE, next->fs);
	prev->fsindex = fsindex;

	if (unlikely(gsindex | next->gsindex | prev->gs)) {
		load_gs_index(next->gsindex);
		if (gsindex)
			prev->gs = 0;
	}
	if (next->gs)
		wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
	prev->gsindex = gsindex;

	switch_fpu_finish(next_p, fpu);

	/*
	 * Switch the PDA and FPU contexts.
	 */
	prev->usersp = percpu_read(old_rsp);
	percpu_write(old_rsp, next->usersp);
	percpu_write(current_task, next_p);

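	/*
	 * Point the per-CPU kernel_stack at the next task's stack so the
	 * syscall entry path switches onto the right kernel stack.
	 */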
	percpu_write(kernel_stack,
		  (unsigned long)task_stack_page(next_p) +
		  THREAD_SIZE - KERNEL_STACK_OFFSET);

	/*
	 * Now maybe reload the debug registers and handle I/O bitmaps
	 */
	if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
		     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
		__switch_to_xtra(prev_p, next_p, tss);

	return prev_p;
}

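/*
 * exec-time personality setup, called via SET_PERSONALITY() from the
 * ELF loader: the native 64-bit variant below, the IA-32 compat
 * variant further down.
 */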
void set_personality_64bit(void)
{
	/* inherit personality from parent */

	/* Make sure to be in 64bit mode */
	clear_thread_flag(TIF_IA32);

	/* Ensure the corresponding mm is not marked. */
	if (current->mm)
		current->mm->context.ia32_compat = 0;

	/* TBD: overwrites user setup. Should have two bits.
	   But 64-bit processes have always behaved this way,
	   so it's not too bad. The main problem is just that
	   32-bit children are affected again. */
	current->personality &= ~READ_IMPLIES_EXEC;
}

void set_personality_ia32(void)
{
	/* inherit personality from parent */

	/* Make sure to be in 32bit mode */
	set_thread_flag(TIF_IA32);
	current->personality |= force_personality32;

	/* Mark the associated mm as containing 32-bit tasks. */
	if (current->mm)
		current->mm->context.ia32_compat = 1;

	/* Prepare the first "return" to user space */
	current_thread_info()->status |= TS_COMPAT;
}

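/*
 * For a sleeping task, walk the saved frame pointers on its kernel
 * stack to find the first return address outside the scheduler; this is
 * what shows up as "wchan" in /proc.  The walk is bounded and bails out
 * if a frame pointer leaves the task's stack.
 */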
unsigned long get_wchan(struct task_struct *p)
{
	unsigned long stack;
	u64 fp, ip;
	int count = 0;

	if (!p || p == current || p->state == TASK_RUNNING)
		return 0;
	stack = (unsigned long)task_stack_page(p);
	if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE)
		return 0;
	fp = *(u64 *)(p->thread.sp);
	do {
		if (fp < (unsigned long)stack ||
		    fp >= (unsigned long)stack+THREAD_SIZE)
			return 0;
		ip = *(u64 *)(fp+8);
		if (!in_sched_functions(ip))
			return ip;
		fp = *(u64 *)fp;
	} while (count++ < 16);
	return 0;
}

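/*
 * Back end of the arch_prctl() system call: get or set the FS/GS base
 * of @task.  Small bases are put in the GDT via the TLS slots (cheaper
 * to switch); large ones go through the FS_BASE/KERNEL_GS_BASE MSRs.
 * A 64-bit thread library would typically set up its TLS block with
 * something like
 *	arch_prctl(ARCH_SET_FS, (unsigned long)tcb);
 * before touching %fs-relative data (tcb here being whatever thread
 * control block the library allocated).
 */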
long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
	int ret = 0;
	int doit = task == current;
	int cpu;

	switch (code) {
	case ARCH_SET_GS:
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, GS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				load_gs_index(GS_TLS_SEL);
			}
			task->thread.gsindex = GS_TLS_SEL;
			task->thread.gs = 0;
		} else {
			task->thread.gsindex = 0;
			task->thread.gs = addr;
			if (doit) {
				load_gs_index(0);
				ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_SET_FS:
		/* Not strictly needed for fs, but do it for symmetry
		   with gs */
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, FS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				loadsegment(fs, FS_TLS_SEL);
			}
			task->thread.fsindex = FS_TLS_SEL;
			task->thread.fs = 0;
		} else {
			task->thread.fsindex = 0;
			task->thread.fs = addr;
			if (doit) {
				/* set the selector to 0 to not confuse
				   __switch_to */
				loadsegment(fs, 0);
				ret = checking_wrmsrl(MSR_FS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_GET_FS: {
		unsigned long base;
		if (task->thread.fsindex == FS_TLS_SEL)
			base = read_32bit_tls(task, FS_TLS);
		else if (doit)
			rdmsrl(MSR_FS_BASE, base);
		else
			base = task->thread.fs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}
	case ARCH_GET_GS: {
		unsigned long base;
		unsigned gsindex;
		if (task->thread.gsindex == GS_TLS_SEL)
			base = read_32bit_tls(task, GS_TLS);
		else if (doit) {
			savesegment(gs, gsindex);
			if (gsindex)
				rdmsrl(MSR_KERNEL_GS_BASE, base);
			else
				base = task->thread.gs;
		} else
			base = task->thread.gs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}

	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}

long sys_arch_prctl(int code, unsigned long addr)
{
	return do_arch_prctl(current, code, addr);
}

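/*
 * Report the task's user stack pointer.  IA-32 tasks have it in the
 * saved pt_regs; native 64-bit tasks keep it in thread.usersp, which
 * __switch_to() refreshes from the per-CPU old_rsp.
 */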
unsigned long KSTK_ESP(struct task_struct *task)
{
	return (test_tsk_thread_flag(task, TIF_IA32)) ?
			(task_pt_regs(task)->sp) : ((task)->thread.usersp);
}