xref: /linux/arch/x86/kernel/process_64.c (revision 092e0e7e520a1fca03e13c9f2d157432a8657ff2)
/*
 *  Copyright (C) 1995  Linus Torvalds
 *
 *  Pentium III FXSR, SSE support
 *	Gareth Hughes <gareth@valinux.com>, May 2000
 *
 *  X86-64 port
 *	Andi Kleen.
 *
 *	CPU hotplug support - ashok.raj@intel.com
 */

/*
 * This file handles the architecture-dependent parts of process handling.
 */

#include <linux/stackprotector.h>
#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/interrupt.h>
#include <linux/delay.h>
#include <linux/module.h>
#include <linux/ptrace.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/tick.h>
#include <linux/prctl.h>
#include <linux/uaccess.h>
#include <linux/io.h>
#include <linux/ftrace.h>

#include <asm/pgtable.h>
#include <asm/system.h>
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/mmu_context.h>
#include <asm/prctl.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/ia32.h>
#include <asm/idle.h>
#include <asm/syscalls.h>
#include <asm/debugreg.h>

#include <trace/events/power.h>

asmlinkage extern void ret_from_fork(void);

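/*
 * old_rsp caches the user %rsp across kernel entry (the 64-bit syscall
 * entry path stashes it here); is_idle tracks whether this CPU is inside
 * the idle loop so that the idle notifiers below fire in matched
 * IDLE_START/IDLE_END pairs.
 */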
DEFINE_PER_CPU(unsigned long, old_rsp);
static DEFINE_PER_CPU(unsigned char, is_idle);

static ATOMIC_NOTIFIER_HEAD(idle_notifier);

void idle_notifier_register(struct notifier_block *n)
{
	atomic_notifier_chain_register(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_register);

void idle_notifier_unregister(struct notifier_block *n)
{
	atomic_notifier_chain_unregister(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_unregister);

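/*
 * A minimal usage sketch for the notifier chain above (not part of this
 * file; the example_* names are hypothetical).  A driver that wants to
 * know when this CPU enters or leaves idle could do:
 *
 *	static int example_idle_notify(struct notifier_block *nb,
 *				       unsigned long action, void *unused)
 *	{
 *		if (action == IDLE_START)
 *			pr_debug("CPU %d entering idle\n", smp_processor_id());
 *		else if (action == IDLE_END)
 *			pr_debug("CPU %d leaving idle\n", smp_processor_id());
 *		return NOTIFY_OK;
 *	}
 *
 *	static struct notifier_block example_idle_nb = {
 *		.notifier_call	= example_idle_notify,
 *	};
 *
 *	idle_notifier_register(&example_idle_nb);
 *
 * The callbacks run from enter_idle()/__exit_idle() below on an atomic
 * notifier chain, so they must not sleep.
 */
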
void enter_idle(void)
{
	percpu_write(is_idle, 1);
	atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}

static void __exit_idle(void)
{
	if (x86_test_and_clear_bit_percpu(0, is_idle) == 0)
		return;
	atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
}

/* Called from interrupts to signify idle end */
void exit_idle(void)
{
	/* idle loop has pid 0 */
	if (current->pid)
		return;
	__exit_idle();
}

#ifndef CONFIG_SMP
static inline void play_dead(void)
{
	BUG();
}
#endif

/*
 * The idle thread. There's no useful work to be
 * done, so just try to conserve power and have a
 * low exit latency (i.e. sit in a loop waiting for
 * somebody to say that they'd like to reschedule)
 */
void cpu_idle(void)
{
	current_thread_info()->status |= TS_POLLING;

	/*
	 * If we're the non-boot CPU, nothing set the stack canary up
	 * for us.  CPU0 already has it initialized but no harm in
	 * doing it again.  This is a good place for updating it, as
	 * we won't ever return from this function (so the invalid
	 * canaries already on the stack won't ever trigger).
	 */
	boot_init_stack_canary();

	/* endless idle loop with no priority at all */
	while (1) {
		tick_nohz_stop_sched_tick(1);
		while (!need_resched()) {

			rmb();

			if (cpu_is_offline(smp_processor_id()))
				play_dead();
			/*
			 * Idle routines should keep interrupts disabled
			 * from here on, until they go to idle.
			 * Otherwise, idle callbacks can misfire.
			 */
			local_irq_disable();
			enter_idle();
			/* Don't trace irqs off for idle */
			stop_critical_timings();
			pm_idle();
			start_critical_timings();

			trace_power_end(smp_processor_id());

			/*
			 * In many cases the interrupt that ended idle
			 * has already called exit_idle.  But some idle
			 * loops can be woken up without an interrupt.
			 */
			__exit_idle();
		}

		tick_nohz_restart_sched_tick();
		preempt_enable_no_resched();
		schedule();
		preempt_disable();
	}
}

/* Also prints some state that isn't saved in the pt_regs */
void __show_regs(struct pt_regs *regs, int all)
{
	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
	unsigned long d0, d1, d2, d3, d6, d7;
	unsigned int fsindex, gsindex;
	unsigned int ds, cs, es;

	show_regs_common();
	printk(KERN_DEFAULT "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
	printk_address(regs->ip, 1);
	printk(KERN_DEFAULT "RSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss,
			regs->sp, regs->flags);
	printk(KERN_DEFAULT "RAX: %016lx RBX: %016lx RCX: %016lx\n",
	       regs->ax, regs->bx, regs->cx);
	printk(KERN_DEFAULT "RDX: %016lx RSI: %016lx RDI: %016lx\n",
	       regs->dx, regs->si, regs->di);
	printk(KERN_DEFAULT "RBP: %016lx R08: %016lx R09: %016lx\n",
	       regs->bp, regs->r8, regs->r9);
	printk(KERN_DEFAULT "R10: %016lx R11: %016lx R12: %016lx\n",
	       regs->r10, regs->r11, regs->r12);
	printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n",
	       regs->r13, regs->r14, regs->r15);

	asm("movl %%ds,%0" : "=r" (ds));
	asm("movl %%cs,%0" : "=r" (cs));
	asm("movl %%es,%0" : "=r" (es));
	asm("movl %%fs,%0" : "=r" (fsindex));
	asm("movl %%gs,%0" : "=r" (gsindex));

	rdmsrl(MSR_FS_BASE, fs);
	rdmsrl(MSR_GS_BASE, gs);
	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

	if (!all)
		return;

	cr0 = read_cr0();
	cr2 = read_cr2();
	cr3 = read_cr3();
	cr4 = read_cr4();

	printk(KERN_DEFAULT "FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
	       fs, fsindex, gs, gsindex, shadowgs);
	printk(KERN_DEFAULT "CS:  %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
			es, cr0);
	printk(KERN_DEFAULT "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
			cr4);

	get_debugreg(d0, 0);
	get_debugreg(d1, 1);
	get_debugreg(d2, 2);
	printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
	get_debugreg(d3, 3);
	get_debugreg(d6, 6);
	get_debugreg(d7, 7);
	printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
}

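/*
 * Sanity check on the way out: a dead task's mm is not expected to still
 * carry an LDT at this point; if it does, dump the details and BUG so the
 * leak is caught.
 */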
void release_thread(struct task_struct *dead_task)
{
	if (dead_task->mm) {
		if (dead_task->mm->context.size) {
			printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
					dead_task->comm,
					dead_task->mm->context.ldt,
					dead_task->mm->context.size);
			BUG();
		}
	}
}

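/*
 * Helpers for the per-thread GDT TLS slots.  do_arch_prctl() below uses
 * them to install FS/GS bases that fit in 32 bits as TLS descriptors,
 * which are faster to switch than the FS/GS base MSRs.
 */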
static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
{
	struct user_desc ud = {
		.base_addr = addr,
		.limit = 0xfffff,
		.seg_32bit = 1,
		.limit_in_pages = 1,
		.useable = 1,
	};
	struct desc_struct *desc = t->thread.tls_array;
	desc += tls;
	fill_ldt(desc, &ud);
}

static inline u32 read_32bit_tls(struct task_struct *t, int tls)
{
	return get_desc_base(&t->thread.tls_array[tls]);
}

/*
 * This gets called before we allocate a new thread and copy
 * the current task into it.
 */
void prepare_to_copy(struct task_struct *tsk)
{
	unlazy_fpu(tsk);
}

int copy_thread(unsigned long clone_flags, unsigned long sp,
		unsigned long unused,
	struct task_struct *p, struct pt_regs *regs)
{
	int err;
	struct pt_regs *childregs;
	struct task_struct *me = current;

	childregs = ((struct pt_regs *)
			(THREAD_SIZE + task_stack_page(p))) - 1;
	*childregs = *regs;

	childregs->ax = 0;
	if (user_mode(regs))
		childregs->sp = sp;
	else
		childregs->sp = (unsigned long)childregs;

	p->thread.sp = (unsigned long) childregs;
	p->thread.sp0 = (unsigned long) (childregs+1);
	p->thread.usersp = me->thread.usersp;

	set_tsk_thread_flag(p, TIF_FORK);

	p->thread.io_bitmap_ptr = NULL;

	savesegment(gs, p->thread.gsindex);
	p->thread.gs = p->thread.gsindex ? 0 : me->thread.gs;
	savesegment(fs, p->thread.fsindex);
	p->thread.fs = p->thread.fsindex ? 0 : me->thread.fs;
	savesegment(es, p->thread.es);
	savesegment(ds, p->thread.ds);

	err = -ENOMEM;
	memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));

	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
		p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
		if (!p->thread.io_bitmap_ptr) {
			p->thread.io_bitmap_max = 0;
			return -ENOMEM;
		}
		memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
				IO_BITMAP_BYTES);
		set_tsk_thread_flag(p, TIF_IO_BITMAP);
	}

	/*
	 * Set a new TLS for the child thread?  A 32-bit child gets a
	 * struct user_desc pointer (arriving in %esi), a 64-bit child
	 * gets the raw FS base value (arriving in %r8).
	 */
	if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
		if (test_thread_flag(TIF_IA32))
			err = do_set_thread_area(p, -1,
				(struct user_desc __user *)childregs->si, 0);
		else
#endif
			err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
		if (err)
			goto out;
	}
	err = 0;
out:
	if (err && p->thread.io_bitmap_ptr) {
		kfree(p->thread.io_bitmap_ptr);
		p->thread.io_bitmap_max = 0;
	}

	return err;
}

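/*
 * Reset the register and segment state so the current task starts
 * executing a fresh program image at new_ip with its stack at new_sp.
 * Used by the 64-bit exec path and (below) the compat exec path.
 */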
static void
start_thread_common(struct pt_regs *regs, unsigned long new_ip,
		    unsigned long new_sp,
		    unsigned int _cs, unsigned int _ss, unsigned int _ds)
{
	loadsegment(fs, 0);
	loadsegment(es, _ds);
	loadsegment(ds, _ds);
	load_gs_index(0);
	regs->ip		= new_ip;
	regs->sp		= new_sp;
	percpu_write(old_rsp, new_sp);
	regs->cs		= _cs;
	regs->ss		= _ss;
	regs->flags		= X86_EFLAGS_IF;
	set_fs(USER_DS);
	/*
	 * Free the old FP and other extended state
	 */
	free_thread_xstate(current);
}

void
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
	start_thread_common(regs, new_ip, new_sp,
			    __USER_CS, __USER_DS, 0);
}

#ifdef CONFIG_IA32_EMULATION
void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp)
{
	start_thread_common(regs, new_ip, new_sp,
			    __USER32_CS, __USER32_DS, __USER32_DS);
}
#endif

/*
 *	switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes are not supported here.  Set the probe on schedule instead.
 * The function graph tracer is not supported here either.
 */
__notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
	struct thread_struct *prev = &prev_p->thread;
	struct thread_struct *next = &next_p->thread;
	int cpu = smp_processor_id();
	struct tss_struct *tss = &per_cpu(init_tss, cpu);
	unsigned fsindex, gsindex;
	bool preload_fpu;

	/*
	 * If the task has used the FPU during the last 5 timeslices, just
	 * do a full restore of the math state immediately to avoid the
	 * trap; the chances of needing the FPU soon are obviously high now.
	 */
	preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5;

	/* we're going to use this soon, after a few expensive things */
	if (preload_fpu)
		prefetch(next->fpu.state);

	/*
	 * Reload sp0 (the kernel stack pointer used on ring transitions)
	 * in the TSS:
	 */
	load_sp0(tss, next);

	/*
	 * Switch DS and ES.
	 * This won't pick up thread selector changes, but I guess that is ok.
	 */
	savesegment(es, prev->es);
	if (unlikely(next->es | prev->es))
		loadsegment(es, next->es);

	savesegment(ds, prev->ds);
	if (unlikely(next->ds | prev->ds))
		loadsegment(ds, next->ds);

	/*
	 * We must save %fs and %gs before load_TLS() because
	 * %fs and %gs may be cleared by load_TLS().
	 *
	 * (e.g. xen_load_tls())
	 */
	savesegment(fs, fsindex);
	savesegment(gs, gsindex);

	load_TLS(next, cpu);

	/* Must be after DS reload */
	__unlazy_fpu(prev_p);

	/* Make sure cpu is ready for new context */
	if (preload_fpu)
		clts();

	/*
	 * Leave lazy mode, flushing any hypercalls made here.
	 * This must be done before restoring TLS segments so
	 * the GDT and LDT are properly updated, and must be
	 * done before math_state_restore, so the TS bit is up
	 * to date.
	 */
	arch_end_context_switch(next_p);

	/*
	 * Switch FS and GS.
	 *
	 * A nonzero segment selector always requires a reload, and so
	 * does a change of selector.  When the previous task used a
	 * 64-bit base, always reload so the old base cannot leak into
	 * the next task.
	 */
	if (unlikely(fsindex | next->fsindex | prev->fs)) {
		loadsegment(fs, next->fsindex);
		/*
		 * If the user used a nonzero selector, clear the saved
		 * 64-bit base: a base set via the MSR is only kept while
		 * the selector is the null selector.
		 */
		if (fsindex)
			prev->fs = 0;
	}
	/* when the next task has a 64-bit base, use it */
	if (next->fs)
		wrmsrl(MSR_FS_BASE, next->fs);
	prev->fsindex = fsindex;

	if (unlikely(gsindex | next->gsindex | prev->gs)) {
		load_gs_index(next->gsindex);
		if (gsindex)
			prev->gs = 0;
	}
	if (next->gs)
		wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
	prev->gsindex = gsindex;

	/*
	 * Switch the per-CPU state: saved user RSP, current task and
	 * kernel stack pointer.
	 */
	prev->usersp = percpu_read(old_rsp);
	percpu_write(old_rsp, next->usersp);
	percpu_write(current_task, next_p);

	percpu_write(kernel_stack,
		  (unsigned long)task_stack_page(next_p) +
		  THREAD_SIZE - KERNEL_STACK_OFFSET);

	/*
	 * Now maybe reload the debug registers and handle I/O bitmaps
	 */
	if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
		     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
		__switch_to_xtra(prev_p, next_p, tss);

	/*
	 * Preload the FPU context, now that we've determined that the
	 * task is likely to be using it.
	 */
	if (preload_fpu)
		__math_state_restore();

	return prev_p;
}

void set_personality_64bit(void)
{
	/* inherit personality from parent */

	/* Make sure to be in 64bit mode */
	clear_thread_flag(TIF_IA32);

	/*
	 * TBD: this overwrites the user's setup.  We should really have
	 * two bits.  But 64-bit processes have always behaved this way,
	 * so it's not too bad.  The main problem is just that 32-bit
	 * children are affected again.
	 */
	current->personality &= ~READ_IMPLIES_EXEC;
}

void set_personality_ia32(void)
{
	/* inherit personality from parent */

	/* Make sure to be in 32bit mode */
	set_thread_flag(TIF_IA32);
	current->personality |= force_personality32;

	/* Prepare the first "return" to user space */
	current_thread_info()->status |= TS_COMPAT;
}

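/*
 * Return the kernel ip a sleeping task is blocked at, for wchan reporting.
 * This relies on frame-pointer call frames: the saved %rbp of the
 * switched-out task sits at the top of its stack (thread.sp) and each
 * frame keeps its return address at 8(%rbp), so walk up the frames until
 * we leave the scheduler's own functions.
 */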
unsigned long get_wchan(struct task_struct *p)
{
	unsigned long stack;
	u64 fp, ip;
	int count = 0;

	if (!p || p == current || p->state == TASK_RUNNING)
		return 0;
	stack = (unsigned long)task_stack_page(p);
	if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE)
		return 0;
	fp = *(u64 *)(p->thread.sp);
	do {
		if (fp < (unsigned long)stack ||
		    fp >= (unsigned long)stack+THREAD_SIZE)
			return 0;
		ip = *(u64 *)(fp+8);
		if (!in_sched_functions(ip))
			return ip;
		fp = *(u64 *)fp;
	} while (count++ < 16);
	return 0;
}

long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
	int ret = 0;
	int doit = task == current;
	int cpu;

	switch (code) {
	case ARCH_SET_GS:
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/*
		 * handle small bases via the GDT because that's faster to
		 * switch.
		 */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, GS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				load_gs_index(GS_TLS_SEL);
			}
			task->thread.gsindex = GS_TLS_SEL;
			task->thread.gs = 0;
		} else {
			task->thread.gsindex = 0;
			task->thread.gs = addr;
			if (doit) {
				load_gs_index(0);
				ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_SET_FS:
		/*
		 * Not strictly needed for fs, but do it for symmetry
		 * with gs
		 */
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/*
		 * handle small bases via the GDT because that's faster to
		 * switch.
		 */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, FS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				loadsegment(fs, FS_TLS_SEL);
			}
			task->thread.fsindex = FS_TLS_SEL;
			task->thread.fs = 0;
		} else {
			task->thread.fsindex = 0;
			task->thread.fs = addr;
			if (doit) {
				/*
				 * set the selector to 0 to not confuse
				 * __switch_to
				 */
				loadsegment(fs, 0);
				ret = checking_wrmsrl(MSR_FS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_GET_FS: {
		unsigned long base;
		if (task->thread.fsindex == FS_TLS_SEL)
			base = read_32bit_tls(task, FS_TLS);
		else if (doit)
			rdmsrl(MSR_FS_BASE, base);
		else
			base = task->thread.fs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}
	case ARCH_GET_GS: {
		unsigned long base;
		unsigned gsindex;
		if (task->thread.gsindex == GS_TLS_SEL)
			base = read_32bit_tls(task, GS_TLS);
		else if (doit) {
			savesegment(gs, gsindex);
			if (gsindex)
				rdmsrl(MSR_KERNEL_GS_BASE, base);
			else
				base = task->thread.gs;
		} else
			base = task->thread.gs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}

	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}

long sys_arch_prctl(int code, unsigned long addr)
{
	return do_arch_prctl(current, code, addr);
}
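
/*
 * For reference, a minimal (hypothetical) userspace sketch of the syscall
 * handled above; glibc does not wrap arch_prctl, so it is normally invoked
 * through syscall(2):
 *
 *	#include <asm/prctl.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	static int set_fs_base(unsigned long base)
 *	{
 *		return syscall(SYS_arch_prctl, ARCH_SET_FS, base);
 *	}
 *
 * Bases that fit in 32 bits end up in a GDT TLS slot, larger ones in
 * MSR_FS_BASE, as implemented in do_arch_prctl() above.
 */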

unsigned long KSTK_ESP(struct task_struct *task)
{
	return (test_tsk_thread_flag(task, TIF_IA32)) ?
			(task_pt_regs(task)->sp) : ((task)->thread.usersp);
}