/*
 *  linux/arch/x86/kernel/process_64.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 *
 *  Pentium III FXSR, SSE support
 *	Gareth Hughes <gareth@valinux.com>, May 2000
 *
 *  X86-64 port
 *	Andi Kleen.
 *
 *	CPU hotplug support - ashok.raj@intel.com
 */

/*
 * This file handles the architecture-dependent parts of process handling.
 */

#include <linux/stackprotector.h>
#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/interrupt.h>
#include <linux/delay.h>
#include <linux/module.h>
#include <linux/ptrace.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/tick.h>
#include <linux/prctl.h>
#include <linux/uaccess.h>
#include <linux/io.h>
#include <linux/ftrace.h>

#include <asm/pgtable.h>
#include <asm/system.h>
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/mmu_context.h>
#include <asm/prctl.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/ia32.h>
#include <asm/idle.h>
#include <asm/syscalls.h>
#include <asm/debugreg.h>

asmlinkage extern void ret_from_fork(void);

/* Per-CPU copy of the current task's user stack pointer while it runs in the kernel */
DEFINE_PER_CPU(unsigned long, old_rsp);
static DEFINE_PER_CPU(unsigned char, is_idle);

static ATOMIC_NOTIFIER_HEAD(idle_notifier);

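/*
 * Idle notifiers let interested code (e.g. drivers that want to quiesce
 * hardware while the CPU sleeps) hear about IDLE_START/IDLE_END
 * transitions on this CPU.  The registered ->notifier_call is invoked
 * with IDLE_START from enter_idle() and IDLE_END from __exit_idle().
 * Illustrative use only (the callback and block names are made up):
 *
 *	static struct notifier_block my_idle_nb = {
 *		.notifier_call = my_idle_cb,
 *	};
 *	idle_notifier_register(&my_idle_nb);
 */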
void idle_notifier_register(struct notifier_block *n)
{
	atomic_notifier_chain_register(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_register);

void idle_notifier_unregister(struct notifier_block *n)
{
	atomic_notifier_chain_unregister(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_unregister);

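/*
 * Mark this CPU as idle and fire the IDLE_START notifiers.  Called by the
 * idle loop below with interrupts disabled; the matching __exit_idle() is
 * run either from the interrupt that ends the idle period or directly by
 * cpu_idle() when the wakeup did not involve an interrupt.
 */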
void enter_idle(void)
{
	percpu_write(is_idle, 1);
	atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}

static void __exit_idle(void)
{
	if (x86_test_and_clear_bit_percpu(0, is_idle) == 0)
		return;
	atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
}

/* Called from interrupts to signify idle end */
void exit_idle(void)
{
	/* idle loop has pid 0 */
	if (current->pid)
		return;
	__exit_idle();
}

#ifndef CONFIG_SMP
static inline void play_dead(void)
{
	BUG();
}
#endif

/*
 * The idle thread.  There's no useful work to be done, so just try to
 * conserve power and have a low exit latency (i.e. sit in a loop waiting
 * for somebody to say that they'd like to reschedule).
 */
void cpu_idle(void)
{
	current_thread_info()->status |= TS_POLLING;

	/*
	 * If we're the non-boot CPU, nothing set the stack canary up
	 * for us.  CPU0 already has it initialized but no harm in
	 * doing it again.  This is a good place for updating it, as
	 * we won't ever return from this function (so the invalid
	 * canaries already on the stack won't ever trigger).
	 */
	boot_init_stack_canary();

	/* endless idle loop with no priority at all */
	while (1) {
		tick_nohz_stop_sched_tick(1);
		while (!need_resched()) {

			rmb();

			if (cpu_is_offline(smp_processor_id()))
				play_dead();
			/*
			 * Idle routines should keep interrupts disabled
			 * from here on, until they go to idle.
			 * Otherwise, idle callbacks can misfire.
			 */
			local_irq_disable();
			enter_idle();
			/* Don't trace irqs off for idle */
			stop_critical_timings();
			pm_idle();
			start_critical_timings();
			/*
			 * In many cases the interrupt that ended idle
			 * has already called exit_idle.  But some idle
			 * loops can be woken up without interrupt.
			 */
			__exit_idle();
		}

		tick_nohz_restart_sched_tick();
		preempt_enable_no_resched();
		schedule();
		preempt_disable();
	}
}

/* Also prints some state that isn't saved in the pt_regs */
void __show_regs(struct pt_regs *regs, int all)
{
	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
	unsigned long d0, d1, d2, d3, d6, d7;
	unsigned int fsindex, gsindex;
	unsigned int ds, cs, es;

	show_regs_common();
	printk(KERN_DEFAULT "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
	printk_address(regs->ip, 1);
	printk(KERN_DEFAULT "RSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss,
			regs->sp, regs->flags);
	printk(KERN_DEFAULT "RAX: %016lx RBX: %016lx RCX: %016lx\n",
	       regs->ax, regs->bx, regs->cx);
	printk(KERN_DEFAULT "RDX: %016lx RSI: %016lx RDI: %016lx\n",
	       regs->dx, regs->si, regs->di);
	printk(KERN_DEFAULT "RBP: %016lx R08: %016lx R09: %016lx\n",
	       regs->bp, regs->r8, regs->r9);
	printk(KERN_DEFAULT "R10: %016lx R11: %016lx R12: %016lx\n",
	       regs->r10, regs->r11, regs->r12);
	printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n",
	       regs->r13, regs->r14, regs->r15);

	asm("movl %%ds,%0" : "=r" (ds));
	asm("movl %%cs,%0" : "=r" (cs));
	asm("movl %%es,%0" : "=r" (es));
	asm("movl %%fs,%0" : "=r" (fsindex));
	asm("movl %%gs,%0" : "=r" (gsindex));

	rdmsrl(MSR_FS_BASE, fs);
	rdmsrl(MSR_GS_BASE, gs);
	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

	if (!all)
		return;

	cr0 = read_cr0();
	cr2 = read_cr2();
	cr3 = read_cr3();
	cr4 = read_cr4();

	printk(KERN_DEFAULT "FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
	       fs, fsindex, gs, gsindex, shadowgs);
	printk(KERN_DEFAULT "CS:  %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
			es, cr0);
	printk(KERN_DEFAULT "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
			cr4);

	get_debugreg(d0, 0);
	get_debugreg(d1, 1);
	get_debugreg(d2, 2);
	printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
	get_debugreg(d3, 3);
	get_debugreg(d6, 6);
	get_debugreg(d7, 7);
	printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
}

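/*
 * Free architecture-specific resources when a task is released.  On
 * x86-64 there is nothing left to free at this point, so this only
 * sanity-checks that the dead process did not leave a private LDT behind.
 */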
void release_thread(struct task_struct *dead_task)
{
	if (dead_task->mm) {
		if (dead_task->mm->context.size) {
			printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
					dead_task->comm,
					dead_task->mm->context.ldt,
					dead_task->mm->context.size);
			BUG();
		}
	}
}

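/*
 * Helpers for the TLS-slot trick used by do_arch_prctl() below: FS/GS
 * bases that fit in 32 bits are installed as an ordinary 32-bit, 4GB-limit
 * descriptor in the thread's TLS array, which is cheaper to switch than
 * writing the FS_BASE/GS_BASE MSRs.
 */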
static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
{
	struct user_desc ud = {
		.base_addr = addr,
		.limit = 0xfffff,
		.seg_32bit = 1,
		.limit_in_pages = 1,
		.useable = 1,
	};
	struct desc_struct *desc = t->thread.tls_array;
	desc += tls;
	fill_ldt(desc, &ud);
}

static inline u32 read_32bit_tls(struct task_struct *t, int tls)
{
	return get_desc_base(&t->thread.tls_array[tls]);
}

/*
 * This gets called before we allocate a new thread and copy
 * the current task into it.
 */
void prepare_to_copy(struct task_struct *tsk)
{
	unlazy_fpu(tsk);
}

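/*
 * Set up the kernel stack, pt_regs and thread_struct of a newly forked
 * task.  The child returns to user space through ret_from_fork() with
 * %rax == 0; @sp is the new user stack pointer for clone() callers that
 * supplied one.
 */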
int copy_thread(unsigned long clone_flags, unsigned long sp,
		unsigned long unused,
	struct task_struct *p, struct pt_regs *regs)
{
	int err;
	struct pt_regs *childregs;
	struct task_struct *me = current;

	childregs = ((struct pt_regs *)
			(THREAD_SIZE + task_stack_page(p))) - 1;
	*childregs = *regs;

	childregs->ax = 0;
	if (user_mode(regs))
		childregs->sp = sp;
	else
		childregs->sp = (unsigned long)childregs;

	p->thread.sp = (unsigned long) childregs;
	p->thread.sp0 = (unsigned long) (childregs+1);
	p->thread.usersp = me->thread.usersp;

	set_tsk_thread_flag(p, TIF_FORK);

	p->thread.io_bitmap_ptr = NULL;

	savesegment(gs, p->thread.gsindex);
	p->thread.gs = p->thread.gsindex ? 0 : me->thread.gs;
	savesegment(fs, p->thread.fsindex);
	p->thread.fs = p->thread.fsindex ? 0 : me->thread.fs;
	savesegment(es, p->thread.es);
	savesegment(ds, p->thread.ds);

	err = -ENOMEM;
	memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));

	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
		p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
		if (!p->thread.io_bitmap_ptr) {
			p->thread.io_bitmap_max = 0;
			return -ENOMEM;
		}
		memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
				IO_BITMAP_BYTES);
		set_tsk_thread_flag(p, TIF_IO_BITMAP);
	}

	/*
	 * Set a new TLS for the child thread?
	 */
	if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
		if (test_thread_flag(TIF_IA32))
			err = do_set_thread_area(p, -1,
				(struct user_desc __user *)childregs->si, 0);
		else
#endif
			err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
		if (err)
			goto out;
	}
	err = 0;
out:
	if (err && p->thread.io_bitmap_ptr) {
		kfree(p->thread.io_bitmap_ptr);
		p->thread.io_bitmap_max = 0;
	}

	return err;
}

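/*
 * Reset the segment and register state for a freshly exec'ed image:
 * flat user code/data segments, zero FS/GS bases, only IF set in the
 * flags, and the new instruction and stack pointers installed.
 */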
static void
start_thread_common(struct pt_regs *regs, unsigned long new_ip,
		    unsigned long new_sp,
		    unsigned int _cs, unsigned int _ss, unsigned int _ds)
{
	loadsegment(fs, 0);
	loadsegment(es, _ds);
	loadsegment(ds, _ds);
	load_gs_index(0);
	regs->ip		= new_ip;
	regs->sp		= new_sp;
	percpu_write(old_rsp, new_sp);
	regs->cs		= _cs;
	regs->ss		= _ss;
	regs->flags		= X86_EFLAGS_IF;
	set_fs(USER_DS);
	/*
	 * Free the old FP and other extended state
	 */
	free_thread_xstate(current);
}

void
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
	start_thread_common(regs, new_ip, new_sp,
			    __USER_CS, __USER_DS, 0);
}

#ifdef CONFIG_IA32_EMULATION
void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp)
{
	start_thread_common(regs, new_ip, new_sp,
			    __USER32_CS, __USER32_DS, __USER32_DS);
}
#endif

/*
 *	switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes are not supported here.  Set the probe on schedule instead.
 * The function graph tracer is not supported here either.
 */
__notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
	struct thread_struct *prev = &prev_p->thread;
	struct thread_struct *next = &next_p->thread;
	int cpu = smp_processor_id();
	struct tss_struct *tss = &per_cpu(init_tss, cpu);
	unsigned fsindex, gsindex;
	bool preload_fpu;

	/*
	 * If the task has used the FPU in the last 5 timeslices, just do a
	 * full restore of the math state immediately to avoid the trap; the
	 * chances of needing the FPU soon are obviously high now.
	 */
	preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5;

	/* we're going to use this soon, after a few expensive things */
	if (preload_fpu)
		prefetch(next->fpu.state);

	/*
	 * Reload esp0, LDT and the page table pointer:
	 */
	load_sp0(tss, next);

	/*
	 * Switch DS and ES.
	 * This won't pick up thread selector changes, but I guess that is ok.
	 */
	savesegment(es, prev->es);
	if (unlikely(next->es | prev->es))
		loadsegment(es, next->es);

	savesegment(ds, prev->ds);
	if (unlikely(next->ds | prev->ds))
		loadsegment(ds, next->ds);

	/*
	 * We must save %fs and %gs before load_TLS() because
	 * %fs and %gs may be cleared by load_TLS().
	 *
	 * (e.g. xen_load_tls())
	 */
	savesegment(fs, fsindex);
	savesegment(gs, gsindex);

	load_TLS(next, cpu);

	/* Must be after DS reload */
	unlazy_fpu(prev_p);

	/* Make sure cpu is ready for new context */
	if (preload_fpu)
		clts();

	/*
	 * Leave lazy mode, flushing any hypercalls made here.
	 * This must be done before restoring TLS segments so
	 * the GDT and LDT are properly updated, and must be
	 * done before math_state_restore, so the TS bit is up
	 * to date.
	 */
	arch_end_context_switch(next_p);

	/*
	 * Switch FS and GS.
	 *
	 * A non-zero segment register always requires a reload.  Also
	 * reload when it has changed.  When the previous process used a
	 * 64-bit base, always reload to avoid an information leak.
	 */
	if (unlikely(fsindex | next->fsindex | prev->fs)) {
		loadsegment(fs, next->fsindex);
		/*
		 * Check if the user used a selector != 0; if yes
		 * clear the 64-bit base, since an overloaded base is
		 * always mapped to the NULL selector.
		 */
		if (fsindex)
			prev->fs = 0;
	}
	/* when the next process has a 64-bit base, use it */
	if (next->fs)
		wrmsrl(MSR_FS_BASE, next->fs);
	prev->fsindex = fsindex;

	if (unlikely(gsindex | next->gsindex | prev->gs)) {
		load_gs_index(next->gsindex);
		if (gsindex)
			prev->gs = 0;
	}
	if (next->gs)
		wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
	prev->gsindex = gsindex;

	/*
	 * Switch the PDA and FPU contexts.
	 */
	prev->usersp = percpu_read(old_rsp);
	percpu_write(old_rsp, next->usersp);
	percpu_write(current_task, next_p);

	percpu_write(kernel_stack,
		  (unsigned long)task_stack_page(next_p) +
		  THREAD_SIZE - KERNEL_STACK_OFFSET);

	/*
	 * Now maybe reload the debug registers and handle I/O bitmaps
	 */
	if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
		     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
		__switch_to_xtra(prev_p, next_p, tss);

	/*
	 * Preload the FPU context, now that we've determined that the
	 * task is likely to be using it.
	 */
	if (preload_fpu)
		__math_state_restore();

	return prev_p;
}

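/*
 * Called from the ELF loaders (via SET_PERSONALITY) to flag whether the
 * newly exec'ed image is a native 64-bit or an ia32 compat binary.
 */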
void set_personality_64bit(void)
{
	/* inherit personality from parent */

	/* Make sure to be in 64bit mode */
	clear_thread_flag(TIF_IA32);

	/*
	 * TBD: this overwrites the user's setup.  It should really use two
	 * bits.  But 64-bit processes have always behaved this way, so it's
	 * not too bad.  The main problem is just that 32-bit children are
	 * affected again.
	 */
	current->personality &= ~READ_IMPLIES_EXEC;
}

void set_personality_ia32(void)
{
	/* inherit personality from parent */

	/* Make sure to be in 32bit mode */
	set_thread_flag(TIF_IA32);
	current->personality |= force_personality32;

	/* Prepare the first "return" to user space */
	current_thread_info()->status |= TS_COMPAT;
}

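/*
 * Walk the sleeping task's saved frame pointers to find the first return
 * address outside the scheduler, i.e. where the task is blocked (used for
 * /proc/<pid>/wchan).  Gives up after 16 frames or if the frame chain
 * leaves the task's stack.
 */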
unsigned long get_wchan(struct task_struct *p)
{
	unsigned long stack;
	u64 fp, ip;
	int count = 0;

	if (!p || p == current || p->state == TASK_RUNNING)
		return 0;
	stack = (unsigned long)task_stack_page(p);
	if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE)
		return 0;
	fp = *(u64 *)(p->thread.sp);
	do {
		if (fp < (unsigned long)stack ||
		    fp >= (unsigned long)stack+THREAD_SIZE)
			return 0;
		ip = *(u64 *)(fp+8);
		if (!in_sched_functions(ip))
			return ip;
		fp = *(u64 *)fp;
	} while (count++ < 16);
	return 0;
}

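/*
 * arch_prctl() backend.  ARCH_SET_FS/ARCH_SET_GS install a new FS or GS
 * base for @task (via a TLS GDT slot for bases below 4GB, via the
 * FS_BASE/KERNEL_GS_BASE MSRs otherwise); ARCH_GET_FS/ARCH_GET_GS write
 * the current base to the user pointer passed in @addr.
 */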
long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
	int ret = 0;
	int doit = task == current;
	int cpu;

	switch (code) {
	case ARCH_SET_GS:
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, GS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				load_gs_index(GS_TLS_SEL);
			}
			task->thread.gsindex = GS_TLS_SEL;
			task->thread.gs = 0;
		} else {
			task->thread.gsindex = 0;
			task->thread.gs = addr;
			if (doit) {
				load_gs_index(0);
				ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_SET_FS:
		/* Not strictly needed for fs, but do it for symmetry
		   with gs */
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, FS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				loadsegment(fs, FS_TLS_SEL);
			}
			task->thread.fsindex = FS_TLS_SEL;
			task->thread.fs = 0;
		} else {
			task->thread.fsindex = 0;
			task->thread.fs = addr;
			if (doit) {
				/* set the selector to 0 to not confuse
				   __switch_to */
				loadsegment(fs, 0);
				ret = checking_wrmsrl(MSR_FS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_GET_FS: {
		unsigned long base;
		if (task->thread.fsindex == FS_TLS_SEL)
			base = read_32bit_tls(task, FS_TLS);
		else if (doit)
			rdmsrl(MSR_FS_BASE, base);
		else
			base = task->thread.fs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}
	case ARCH_GET_GS: {
		unsigned long base;
		unsigned gsindex;
		if (task->thread.gsindex == GS_TLS_SEL)
			base = read_32bit_tls(task, GS_TLS);
		else if (doit) {
			savesegment(gs, gsindex);
			if (gsindex)
				rdmsrl(MSR_KERNEL_GS_BASE, base);
			else
				base = task->thread.gs;
		} else
			base = task->thread.gs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}

	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}

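/*
 * Syscall entry point.  From 64-bit user space this is typically reached
 * with something like the following (illustrative only):
 *
 *	#include <asm/prctl.h>
 *	#include <sys/syscall.h>
 *
 *	unsigned long base;
 *	syscall(SYS_arch_prctl, ARCH_GET_FS, &base);
 */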
long sys_arch_prctl(int code, unsigned long addr)
{
	return do_arch_prctl(current, code, addr);
}

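/*
 * Report the task's user stack pointer.  For 64-bit tasks the stack
 * pointer in pt_regs is not reliably saved on the SYSCALL fast path, so
 * use the copy kept in thread.usersp instead; ia32 tasks can read it from
 * pt_regs directly.
 */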
unsigned long KSTK_ESP(struct task_struct *task)
{
	return (test_tsk_thread_flag(task, TIF_IA32)) ?
			(task_pt_regs(task)->sp) : ((task)->thread.usersp);
}