xref: /linux/arch/x86/kernel/process_64.c (revision a33f32244d8550da8b4a26e277ce07d5c6d158b5)
1 /*
2  *  Copyright (C) 1995  Linus Torvalds
3  *
4  *  Pentium III FXSR, SSE support
5  *	Gareth Hughes <gareth@valinux.com>, May 2000
6  *
7  *  X86-64 port
8  *	Andi Kleen.
9  *
10  *	CPU hotplug support - ashok.raj@intel.com
11  */
12 
13 /*
14  * This file handles the architecture-dependent parts of process handling.
15  */
16 
17 #include <linux/stackprotector.h>
18 #include <linux/cpu.h>
19 #include <linux/errno.h>
20 #include <linux/sched.h>
21 #include <linux/fs.h>
22 #include <linux/kernel.h>
23 #include <linux/mm.h>
24 #include <linux/elfcore.h>
25 #include <linux/smp.h>
26 #include <linux/slab.h>
27 #include <linux/user.h>
28 #include <linux/interrupt.h>
29 #include <linux/delay.h>
30 #include <linux/module.h>
31 #include <linux/ptrace.h>
32 #include <linux/notifier.h>
33 #include <linux/kprobes.h>
34 #include <linux/kdebug.h>
35 #include <linux/tick.h>
36 #include <linux/prctl.h>
37 #include <linux/uaccess.h>
38 #include <linux/io.h>
39 #include <linux/ftrace.h>
40 
41 #include <asm/pgtable.h>
42 #include <asm/system.h>
43 #include <asm/processor.h>
44 #include <asm/i387.h>
45 #include <asm/mmu_context.h>
46 #include <asm/prctl.h>
47 #include <asm/desc.h>
48 #include <asm/proto.h>
49 #include <asm/ia32.h>
50 #include <asm/idle.h>
51 #include <asm/syscalls.h>
52 #include <asm/ds.h>
53 #include <asm/debugreg.h>
54 
55 asmlinkage extern void ret_from_fork(void);
56 
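/*
 * Per-CPU bookkeeping: old_rsp caches the user-mode stack pointer of the
 * task currently running on this CPU (handed over in __switch_to() below),
 * and is_idle records whether this CPU is inside the idle loop so that
 * __exit_idle() only signals IDLE_END once.
 */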
57 DEFINE_PER_CPU(unsigned long, old_rsp);
58 static DEFINE_PER_CPU(unsigned char, is_idle);
59 
60 static ATOMIC_NOTIFIER_HEAD(idle_notifier);
61 
62 void idle_notifier_register(struct notifier_block *n)
63 {
64 	atomic_notifier_chain_register(&idle_notifier, n);
65 }
66 EXPORT_SYMBOL_GPL(idle_notifier_register);
67 
68 void idle_notifier_unregister(struct notifier_block *n)
69 {
70 	atomic_notifier_chain_unregister(&idle_notifier, n);
71 }
72 EXPORT_SYMBOL_GPL(idle_notifier_unregister);
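
/*
 * A minimal usage sketch of the idle notifier chain, assuming a hypothetical
 * caller (my_idle_notify() and my_idle_nb are illustrative names, not part
 * of this file):
 *
 *	static int my_idle_notify(struct notifier_block *nb,
 *				  unsigned long action, void *unused)
 *	{
 *		if (action == IDLE_START)
 *			handle_idle_entry();
 *		else if (action == IDLE_END)
 *			handle_idle_exit();
 *		return NOTIFY_OK;
 *	}
 *
 *	static struct notifier_block my_idle_nb = {
 *		.notifier_call = my_idle_notify,
 *	};
 *
 *	idle_notifier_register(&my_idle_nb);
 *
 * handle_idle_entry()/handle_idle_exit() stand in for whatever the caller
 * wants to do; IDLE_START and IDLE_END are raised by enter_idle() and
 * __exit_idle() below.
 */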
73 
74 void enter_idle(void)
75 {
76 	percpu_write(is_idle, 1);
77 	atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
78 }
79 
80 static void __exit_idle(void)
81 {
82 	if (x86_test_and_clear_bit_percpu(0, is_idle) == 0)
83 		return;
84 	atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
85 }
86 
87 /* Called from interrupts to signify idle end */
88 void exit_idle(void)
89 {
90 	/* idle loop has pid 0 */
91 	if (current->pid)
92 		return;
93 	__exit_idle();
94 }
95 
96 #ifndef CONFIG_SMP
97 static inline void play_dead(void)
98 {
99 	BUG();
100 }
101 #endif
102 
103 /*
104  * The idle thread. There's no useful work to be
105  * done, so just try to conserve power and have a
106  * low exit latency (i.e. sit in a loop waiting for
107  * somebody to say that they'd like to reschedule).
108  */
109 void cpu_idle(void)
110 {
111 	current_thread_info()->status |= TS_POLLING;
112 
113 	/*
114 	 * If we're the non-boot CPU, nothing set the stack canary up
115 	 * for us.  CPU0 already has it initialized but no harm in
116 	 * doing it again.  This is a good place for updating it, as
117 	 * we won't ever return from this function (so the invalid
118 	 * canaries already on the stack won't ever trigger).
119 	 */
120 	boot_init_stack_canary();
121 
122 	/* endless idle loop with no priority at all */
123 	while (1) {
124 		tick_nohz_stop_sched_tick(1);
125 		while (!need_resched()) {
126 
127 			rmb();
128 
129 			if (cpu_is_offline(smp_processor_id()))
130 				play_dead();
131 			/*
132 			 * Idle routines should keep interrupts disabled
133 			 * from here on, until they go to idle.
134 			 * Otherwise, idle callbacks can misfire.
135 			 */
136 			local_irq_disable();
137 			enter_idle();
138 			/* Don't trace irqs off for idle */
139 			stop_critical_timings();
140 			pm_idle();
141 			start_critical_timings();
142 			/* In many cases the interrupt that ended idle
143 		   has already called exit_idle(). But some idle
144 		   loops can be woken up without an interrupt. */
145 			__exit_idle();
146 		}
147 
148 		tick_nohz_restart_sched_tick();
149 		preempt_enable_no_resched();
150 		schedule();
151 		preempt_disable();
152 	}
153 }
154 
155 /* Also prints some state that isn't saved in pt_regs */
156 void __show_regs(struct pt_regs *regs, int all)
157 {
158 	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
159 	unsigned long d0, d1, d2, d3, d6, d7;
160 	unsigned int fsindex, gsindex;
161 	unsigned int ds, cs, es;
162 
163 	show_regs_common();
164 	printk(KERN_DEFAULT "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
165 	printk_address(regs->ip, 1);
166 	printk(KERN_DEFAULT "RSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss,
167 			regs->sp, regs->flags);
168 	printk(KERN_DEFAULT "RAX: %016lx RBX: %016lx RCX: %016lx\n",
169 	       regs->ax, regs->bx, regs->cx);
170 	printk(KERN_DEFAULT "RDX: %016lx RSI: %016lx RDI: %016lx\n",
171 	       regs->dx, regs->si, regs->di);
172 	printk(KERN_DEFAULT "RBP: %016lx R08: %016lx R09: %016lx\n",
173 	       regs->bp, regs->r8, regs->r9);
174 	printk(KERN_DEFAULT "R10: %016lx R11: %016lx R12: %016lx\n",
175 	       regs->r10, regs->r11, regs->r12);
176 	printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n",
177 	       regs->r13, regs->r14, regs->r15);
178 
179 	asm("movl %%ds,%0" : "=r" (ds));
180 	asm("movl %%cs,%0" : "=r" (cs));
181 	asm("movl %%es,%0" : "=r" (es));
182 	asm("movl %%fs,%0" : "=r" (fsindex));
183 	asm("movl %%gs,%0" : "=r" (gsindex));
184 
185 	rdmsrl(MSR_FS_BASE, fs);
186 	rdmsrl(MSR_GS_BASE, gs);
187 	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
188 
189 	if (!all)
190 		return;
191 
192 	cr0 = read_cr0();
193 	cr2 = read_cr2();
194 	cr3 = read_cr3();
195 	cr4 = read_cr4();
196 
197 	printk(KERN_DEFAULT "FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
198 	       fs, fsindex, gs, gsindex, shadowgs);
199 	printk(KERN_DEFAULT "CS:  %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
200 			es, cr0);
201 	printk(KERN_DEFAULT "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
202 			cr4);
203 
204 	get_debugreg(d0, 0);
205 	get_debugreg(d1, 1);
206 	get_debugreg(d2, 2);
207 	printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
208 	get_debugreg(d3, 3);
209 	get_debugreg(d6, 6);
210 	get_debugreg(d7, 7);
211 	printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
212 }
213 
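/*
 * Called when the task is finally reaped; a dying 64-bit task should have
 * released its LDT by now, so finding one here is treated as a bug.
 */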
214 void release_thread(struct task_struct *dead_task)
215 {
216 	if (dead_task->mm) {
217 		if (dead_task->mm->context.size) {
218 			printk(KERN_WARNING "WARNING: dead process %8s still has LDT? <%p/%d>\n",
219 					dead_task->comm,
220 					dead_task->mm->context.ldt,
221 					dead_task->mm->context.size);
222 			BUG();
223 		}
224 	}
225 }
226 
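/*
 * Helpers for 32-bit segment bases kept in GDT TLS slots: set_32bit_tls()
 * writes a base address into one of the task's TLS descriptors and
 * read_32bit_tls() reads it back.  do_arch_prctl() below uses them for
 * FS/GS bases that fit in 32 bits.
 */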
227 static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
228 {
229 	struct user_desc ud = {
230 		.base_addr = addr,
231 		.limit = 0xfffff,
232 		.seg_32bit = 1,
233 		.limit_in_pages = 1,
234 		.useable = 1,
235 	};
236 	struct desc_struct *desc = t->thread.tls_array;
237 	desc += tls;
238 	fill_ldt(desc, &ud);
239 }
240 
241 static inline u32 read_32bit_tls(struct task_struct *t, int tls)
242 {
243 	return get_desc_base(&t->thread.tls_array[tls]);
244 }
245 
246 /*
247  * This gets called before we allocate a new thread and copy
248  * the current task into it.
249  */
250 void prepare_to_copy(struct task_struct *tsk)
251 {
252 	unlazy_fpu(tsk);
253 }
254 
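/*
 * Set up the thread state of a freshly forked task: copy the parent's user
 * registers, zero the child's return value (ax), duplicate the I/O bitmap
 * if the parent has one, and honour CLONE_SETTLS.
 */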
255 int copy_thread(unsigned long clone_flags, unsigned long sp,
256 		unsigned long unused,
257 	struct task_struct *p, struct pt_regs *regs)
258 {
259 	int err;
260 	struct pt_regs *childregs;
261 	struct task_struct *me = current;
262 
263 	childregs = ((struct pt_regs *)
264 			(THREAD_SIZE + task_stack_page(p))) - 1;
265 	*childregs = *regs;
266 
267 	childregs->ax = 0;
268 	if (user_mode(regs))
269 		childregs->sp = sp;
270 	else
271 		childregs->sp = (unsigned long)childregs;
272 
273 	p->thread.sp = (unsigned long) childregs;
274 	p->thread.sp0 = (unsigned long) (childregs+1);
275 	p->thread.usersp = me->thread.usersp;
276 
277 	set_tsk_thread_flag(p, TIF_FORK);
278 
279 	p->thread.fs = me->thread.fs;
280 	p->thread.gs = me->thread.gs;
281 	p->thread.io_bitmap_ptr = NULL;
282 
283 	savesegment(gs, p->thread.gsindex);
284 	savesegment(fs, p->thread.fsindex);
285 	savesegment(es, p->thread.es);
286 	savesegment(ds, p->thread.ds);
287 
288 	err = -ENOMEM;
289 	memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
290 
291 	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
292 		p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
293 		if (!p->thread.io_bitmap_ptr) {
294 			p->thread.io_bitmap_max = 0;
295 			return -ENOMEM;
296 		}
297 		memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
298 				IO_BITMAP_BYTES);
299 		set_tsk_thread_flag(p, TIF_IO_BITMAP);
300 	}
301 
302 	/*
303 	 * Set a new TLS for the child thread?
304 	 */
305 	if (clone_flags & CLONE_SETTLS) {
306 #ifdef CONFIG_IA32_EMULATION
307 		if (test_thread_flag(TIF_IA32))
308 			err = do_set_thread_area(p, -1,
309 				(struct user_desc __user *)childregs->si, 0);
310 		else
311 #endif
312 			err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
313 		if (err)
314 			goto out;
315 	}
316 
317 	clear_tsk_thread_flag(p, TIF_DS_AREA_MSR);
318 	p->thread.ds_ctx = NULL;
319 
320 	clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);
321 	p->thread.debugctlmsr = 0;
322 
323 	err = 0;
324 out:
325 	if (err && p->thread.io_bitmap_ptr) {
326 		kfree(p->thread.io_bitmap_ptr);
327 		p->thread.io_bitmap_max = 0;
328 	}
329 
330 	return err;
331 }
332 
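/*
 * Common register and segment setup when starting a new user image (exec):
 * reset FS/GS, point the saved registers at the new entry point and stack,
 * and free any extended FP state inherited from the old image.
 */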
333 static void
334 start_thread_common(struct pt_regs *regs, unsigned long new_ip,
335 		    unsigned long new_sp,
336 		    unsigned int _cs, unsigned int _ss, unsigned int _ds)
337 {
338 	loadsegment(fs, 0);
339 	loadsegment(es, _ds);
340 	loadsegment(ds, _ds);
341 	load_gs_index(0);
342 	regs->ip		= new_ip;
343 	regs->sp		= new_sp;
344 	percpu_write(old_rsp, new_sp);
345 	regs->cs		= _cs;
346 	regs->ss		= _ss;
347 	regs->flags		= X86_EFLAGS_IF;
348 	set_fs(USER_DS);
349 	/*
350 	 * Free the old FP and other extended state
351 	 */
352 	free_thread_xstate(current);
353 }
354 
355 void
356 start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
357 {
358 	start_thread_common(regs, new_ip, new_sp,
359 			    __USER_CS, __USER_DS, 0);
360 }
361 
362 #ifdef CONFIG_IA32_EMULATION
363 void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp)
364 {
365 	start_thread_common(regs, new_ip, new_sp,
366 			    __USER32_CS, __USER32_DS, __USER32_DS);
367 }
368 #endif
369 
370 /*
371  *	switch_to(x,y) should switch tasks from x to y.
372  *
373  * This could still be optimized:
374  * - fold all the options into a flag word and test it with a single test.
375  * - could test fs/gs bitsliced
376  *
377  * Kprobes not supported here. Set the probe on schedule instead.
378  * The function graph tracer is not supported here either.
379  */
380 __notrace_funcgraph struct task_struct *
381 __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
382 {
383 	struct thread_struct *prev = &prev_p->thread;
384 	struct thread_struct *next = &next_p->thread;
385 	int cpu = smp_processor_id();
386 	struct tss_struct *tss = &per_cpu(init_tss, cpu);
387 	unsigned fsindex, gsindex;
388 	bool preload_fpu;
389 
390 	/*
391 	 * If the task has used the FPU during the last 5 timeslices, just do
392 	 * a full restore of the math state immediately to avoid the trap; the
393 	 * chances of needing the FPU again soon are obviously high now.
394 	 */
395 	preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5;
396 
397 	/* we're going to use this soon, after a few expensive things */
398 	if (preload_fpu)
399 		prefetch(next->xstate);
400 
401 	/*
402 	 * Reload esp0, LDT and the page table pointer:
403 	 */
404 	load_sp0(tss, next);
405 
406 	/*
407 	 * Switch DS and ES.
408 	 * This won't pick up thread selector changes, but I guess that is ok.
409 	 */
410 	savesegment(es, prev->es);
411 	if (unlikely(next->es | prev->es))
412 		loadsegment(es, next->es);
413 
414 	savesegment(ds, prev->ds);
415 	if (unlikely(next->ds | prev->ds))
416 		loadsegment(ds, next->ds);
417 
418 
419 	/*
420 	 * We must save %fs and %gs before load_TLS() because %fs and %gs
421 	 * may be cleared by load_TLS().
422 	 * (e.g. xen_load_tls())
423 	 */
424 	savesegment(fs, fsindex);
425 	savesegment(gs, gsindex);
426 
427 	load_TLS(next, cpu);
428 
429 	/* Must be after DS reload */
430 	unlazy_fpu(prev_p);
431 
432 	/* Make sure cpu is ready for new context */
433 	if (preload_fpu)
434 		clts();
435 
436 	/*
437 	 * Leave lazy mode, flushing any hypercalls made here.
438 	 * This must be done before restoring TLS segments so
439 	 * the GDT and LDT are properly updated, and must be
440 	 * done before math_state_restore, so the TS bit is up
441 	 * to date.
442 	 */
443 	arch_end_context_switch(next_p);
444 
445 	/*
446 	 * Switch FS and GS.
447 	 *
448 	 * A segment register value != 0 always requires a reload.  Also
449 	 * reload when it has changed.  When the previous process used a
450 	 * 64-bit base, always reload to avoid an information leak.
451 	 */
452 	if (unlikely(fsindex | next->fsindex | prev->fs)) {
453 		loadsegment(fs, next->fsindex);
454 		/*
455 		 * Check if the user used a selector != 0; if so,
456 		 * clear the 64-bit base, since the overloaded base is
457 		 * always mapped to the null selector.
458 		 */
459 		if (fsindex)
460 			prev->fs = 0;
461 	}
462 	/* When the next process has a 64-bit base, use it. */
463 	if (next->fs)
464 		wrmsrl(MSR_FS_BASE, next->fs);
465 	prev->fsindex = fsindex;
466 
467 	if (unlikely(gsindex | next->gsindex | prev->gs)) {
468 		load_gs_index(next->gsindex);
469 		if (gsindex)
470 			prev->gs = 0;
471 	}
472 	if (next->gs)
473 		wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
474 	prev->gsindex = gsindex;
475 
476 	/*
477 	 * Switch the per-CPU thread state (formerly kept in the PDA).
478 	 */
479 	prev->usersp = percpu_read(old_rsp);
480 	percpu_write(old_rsp, next->usersp);
481 	percpu_write(current_task, next_p);
482 
483 	percpu_write(kernel_stack,
484 		  (unsigned long)task_stack_page(next_p) +
485 		  THREAD_SIZE - KERNEL_STACK_OFFSET);
486 
487 	/*
488 	 * Now maybe reload the debug registers and handle I/O bitmaps
489 	 */
490 	if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
491 		     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
492 		__switch_to_xtra(prev_p, next_p, tss);
493 
494 	/*
495 	 * Preload the FPU context, now that we've determined that the
496 	 * task is likely to be using it.
497 	 */
498 	if (preload_fpu)
499 		__math_state_restore();
500 
501 	return prev_p;
502 }
503 
504 void set_personality_64bit(void)
505 {
506 	/* inherit personality from parent */
507 
508 	/* Make sure to be in 64bit mode */
509 	clear_thread_flag(TIF_IA32);
510 
511 	/* TBD: this overwrites the user's setup. Should have two bits.
512 	   But 64-bit processes have always behaved this way,
513 	   so it's not too bad. The main problem is just that
514 	   32-bit children are affected again. */
515 	current->personality &= ~READ_IMPLIES_EXEC;
516 }
517 
518 void set_personality_ia32(void)
519 {
520 	/* inherit personality from parent */
521 
522 	/* Make sure to be in 32bit mode */
523 	set_thread_flag(TIF_IA32);
524 	current->personality |= force_personality32;
525 
526 	/* Prepare the first "return" to user space */
527 	current_thread_info()->status |= TS_COMPAT;
528 }
529 
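/*
 * Find where a sleeping task is blocked by walking its saved frame-pointer
 * chain until the return address leaves the scheduler functions (this is
 * what /proc/<pid>/wchan reports).  Returns 0 if the chain cannot be
 * followed safely.
 */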
530 unsigned long get_wchan(struct task_struct *p)
531 {
532 	unsigned long stack;
533 	u64 fp, ip;
534 	int count = 0;
535 
536 	if (!p || p == current || p->state == TASK_RUNNING)
537 		return 0;
538 	stack = (unsigned long)task_stack_page(p);
539 	if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE)
540 		return 0;
541 	fp = *(u64 *)(p->thread.sp);
542 	do {
543 		if (fp < (unsigned long)stack ||
544 		    fp >= (unsigned long)stack+THREAD_SIZE)
545 			return 0;
546 		ip = *(u64 *)(fp+8);
547 		if (!in_sched_functions(ip))
548 			return ip;
549 		fp = *(u64 *)fp;
550 	} while (count++ < 16);
551 	return 0;
552 }
553 
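/*
 * arch_prctl() backend: ARCH_SET_FS/ARCH_SET_GS install a new FS or GS base
 * (through a GDT TLS slot when the base fits in 32 bits, through the
 * FS_BASE/KERNEL_GS_BASE MSRs otherwise), while ARCH_GET_FS/ARCH_GET_GS
 * write the current base back to user space at *addr.
 */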
554 long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
555 {
556 	int ret = 0;
557 	int doit = task == current;
558 	int cpu;
559 
560 	switch (code) {
561 	case ARCH_SET_GS:
562 		if (addr >= TASK_SIZE_OF(task))
563 			return -EPERM;
564 		cpu = get_cpu();
565 		/* handle small bases via the GDT because that's faster to
566 		   switch. */
567 		if (addr <= 0xffffffff) {
568 			set_32bit_tls(task, GS_TLS, addr);
569 			if (doit) {
570 				load_TLS(&task->thread, cpu);
571 				load_gs_index(GS_TLS_SEL);
572 			}
573 			task->thread.gsindex = GS_TLS_SEL;
574 			task->thread.gs = 0;
575 		} else {
576 			task->thread.gsindex = 0;
577 			task->thread.gs = addr;
578 			if (doit) {
579 				load_gs_index(0);
580 				ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
581 			}
582 		}
583 		put_cpu();
584 		break;
585 	case ARCH_SET_FS:
586 		/* Not strictly needed for fs, but do it for symmetry
587 		   with gs */
588 		if (addr >= TASK_SIZE_OF(task))
589 			return -EPERM;
590 		cpu = get_cpu();
591 		/* handle small bases via the GDT because that's faster to
592 		   switch. */
593 		if (addr <= 0xffffffff) {
594 			set_32bit_tls(task, FS_TLS, addr);
595 			if (doit) {
596 				load_TLS(&task->thread, cpu);
597 				loadsegment(fs, FS_TLS_SEL);
598 			}
599 			task->thread.fsindex = FS_TLS_SEL;
600 			task->thread.fs = 0;
601 		} else {
602 			task->thread.fsindex = 0;
603 			task->thread.fs = addr;
604 			if (doit) {
605 				/* set the selector to 0 to not confuse
606 				   __switch_to */
607 				loadsegment(fs, 0);
608 				ret = checking_wrmsrl(MSR_FS_BASE, addr);
609 			}
610 		}
611 		put_cpu();
612 		break;
613 	case ARCH_GET_FS: {
614 		unsigned long base;
615 		if (task->thread.fsindex == FS_TLS_SEL)
616 			base = read_32bit_tls(task, FS_TLS);
617 		else if (doit)
618 			rdmsrl(MSR_FS_BASE, base);
619 		else
620 			base = task->thread.fs;
621 		ret = put_user(base, (unsigned long __user *)addr);
622 		break;
623 	}
624 	case ARCH_GET_GS: {
625 		unsigned long base;
626 		unsigned gsindex;
627 		if (task->thread.gsindex == GS_TLS_SEL)
628 			base = read_32bit_tls(task, GS_TLS);
629 		else if (doit) {
630 			savesegment(gs, gsindex);
631 			if (gsindex)
632 				rdmsrl(MSR_KERNEL_GS_BASE, base);
633 			else
634 				base = task->thread.gs;
635 		} else
636 			base = task->thread.gs;
637 		ret = put_user(base, (unsigned long __user *)addr);
638 		break;
639 	}
640 
641 	default:
642 		ret = -EINVAL;
643 		break;
644 	}
645 
646 	return ret;
647 }
648 
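/* The arch_prctl(2) system call entry point: operate on the calling task. */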
649 long sys_arch_prctl(int code, unsigned long addr)
650 {
651 	return do_arch_prctl(current, code, addr);
652 }
653 
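/*
 * Report a task's user stack pointer: ia32 tasks keep it in their pt_regs,
 * 64-bit tasks in thread.usersp (kept in sync with the per-CPU old_rsp
 * above).
 */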
654 unsigned long KSTK_ESP(struct task_struct *task)
655 {
656 	return (test_tsk_thread_flag(task, TIF_IA32)) ?
657 			(task_pt_regs(task)->sp) : ((task)->thread.usersp);
658 }
659