xref: /linux/arch/x86/kernel/process_64.c (revision 0883c2c06fb5bcf5b9e008270827e63c09a88c1e)
1 /*
2  *  Copyright (C) 1995  Linus Torvalds
3  *
4  *  Pentium III FXSR, SSE support
5  *	Gareth Hughes <gareth@valinux.com>, May 2000
6  *
7  *  X86-64 port
8  *	Andi Kleen.
9  *
10  *	CPU hotplug support - ashok.raj@intel.com
11  */
12 
13 /*
14  * This file handles the architecture-dependent parts of process handling..
15  */
16 
17 #include <linux/cpu.h>
18 #include <linux/errno.h>
19 #include <linux/sched.h>
20 #include <linux/fs.h>
21 #include <linux/kernel.h>
22 #include <linux/mm.h>
23 #include <linux/elfcore.h>
24 #include <linux/smp.h>
25 #include <linux/slab.h>
26 #include <linux/user.h>
27 #include <linux/interrupt.h>
28 #include <linux/delay.h>
29 #include <linux/module.h>
30 #include <linux/ptrace.h>
31 #include <linux/notifier.h>
32 #include <linux/kprobes.h>
33 #include <linux/kdebug.h>
34 #include <linux/prctl.h>
35 #include <linux/uaccess.h>
36 #include <linux/io.h>
37 #include <linux/ftrace.h>
38 
39 #include <asm/pgtable.h>
40 #include <asm/processor.h>
41 #include <asm/fpu/internal.h>
42 #include <asm/mmu_context.h>
43 #include <asm/prctl.h>
44 #include <asm/desc.h>
45 #include <asm/proto.h>
46 #include <asm/ia32.h>
47 #include <asm/idle.h>
48 #include <asm/syscalls.h>
49 #include <asm/debugreg.h>
50 #include <asm/switch_to.h>
51 #include <asm/xen/hypervisor.h>
52 
/*
 * Declared here, defined outside this file.
 * NOTE(review): presumably the entry stub a freshly forked task first
 * returns through -- confirm against the entry code.
 */
asmlinkage extern void ret_from_fork(void);

/*
 * Per-CPU unsigned long named rsp_scratch, made externally visible.
 * NOTE(review): presumably a scratch slot for the entry code to park a
 * stack pointer in -- confirm against its users.
 */
__visible DEFINE_PER_CPU(unsigned long, rsp_scratch);
56 
57 /* Prints also some state that isn't saved in the pt_regs */
58 void __show_regs(struct pt_regs *regs, int all)
59 {
60 	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
61 	unsigned long d0, d1, d2, d3, d6, d7;
62 	unsigned int fsindex, gsindex;
63 	unsigned int ds, cs, es;
64 
65 	printk(KERN_DEFAULT "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
66 	printk_address(regs->ip);
67 	printk(KERN_DEFAULT "RSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss,
68 			regs->sp, regs->flags);
69 	printk(KERN_DEFAULT "RAX: %016lx RBX: %016lx RCX: %016lx\n",
70 	       regs->ax, regs->bx, regs->cx);
71 	printk(KERN_DEFAULT "RDX: %016lx RSI: %016lx RDI: %016lx\n",
72 	       regs->dx, regs->si, regs->di);
73 	printk(KERN_DEFAULT "RBP: %016lx R08: %016lx R09: %016lx\n",
74 	       regs->bp, regs->r8, regs->r9);
75 	printk(KERN_DEFAULT "R10: %016lx R11: %016lx R12: %016lx\n",
76 	       regs->r10, regs->r11, regs->r12);
77 	printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n",
78 	       regs->r13, regs->r14, regs->r15);
79 
80 	asm("movl %%ds,%0" : "=r" (ds));
81 	asm("movl %%cs,%0" : "=r" (cs));
82 	asm("movl %%es,%0" : "=r" (es));
83 	asm("movl %%fs,%0" : "=r" (fsindex));
84 	asm("movl %%gs,%0" : "=r" (gsindex));
85 
86 	rdmsrl(MSR_FS_BASE, fs);
87 	rdmsrl(MSR_GS_BASE, gs);
88 	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
89 
90 	if (!all)
91 		return;
92 
93 	cr0 = read_cr0();
94 	cr2 = read_cr2();
95 	cr3 = read_cr3();
96 	cr4 = __read_cr4();
97 
98 	printk(KERN_DEFAULT "FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
99 	       fs, fsindex, gs, gsindex, shadowgs);
100 	printk(KERN_DEFAULT "CS:  %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
101 			es, cr0);
102 	printk(KERN_DEFAULT "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
103 			cr4);
104 
105 	get_debugreg(d0, 0);
106 	get_debugreg(d1, 1);
107 	get_debugreg(d2, 2);
108 	get_debugreg(d3, 3);
109 	get_debugreg(d6, 6);
110 	get_debugreg(d7, 7);
111 
112 	/* Only print out debug registers if they are in their non-default state. */
113 	if ((d0 == 0) && (d1 == 0) && (d2 == 0) && (d3 == 0) &&
114 	    (d6 == DR6_RESERVED) && (d7 == 0x400))
115 		return;
116 
117 	printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
118 	printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
119 
120 	if (boot_cpu_has(X86_FEATURE_OSPKE))
121 		printk(KERN_DEFAULT "PKRU: %08x\n", read_pkru());
122 }
123 
/*
 * Sanity check on a dead task.  With CONFIG_MODIFY_LDT_SYSCALL enabled, a
 * task that still has an mm whose context carries an LDT triggers a warning
 * followed by BUG().  In every other case this does nothing.
 */
void release_thread(struct task_struct *dead_task)
{
#ifdef CONFIG_MODIFY_LDT_SYSCALL
	struct mm_struct *mm = dead_task->mm;

	/* No mm, or no LDT attached to it: nothing to complain about. */
	if (!mm || !mm->context.ldt)
		return;

	pr_warn("WARNING: dead process %s still has LDT? <%p/%d>\n",
		dead_task->comm,
		mm->context.ldt->entries,
		mm->context.ldt->size);
	BUG();
#endif
}
138 
139 int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
140 		unsigned long arg, struct task_struct *p, unsigned long tls)
141 {
142 	int err;
143 	struct pt_regs *childregs;
144 	struct task_struct *me = current;
145 
146 	p->thread.sp0 = (unsigned long)task_stack_page(p) + THREAD_SIZE;
147 	childregs = task_pt_regs(p);
148 	p->thread.sp = (unsigned long) childregs;
149 	set_tsk_thread_flag(p, TIF_FORK);
150 	p->thread.io_bitmap_ptr = NULL;
151 
152 	savesegment(gs, p->thread.gsindex);
153 	p->thread.gsbase = p->thread.gsindex ? 0 : me->thread.gsbase;
154 	savesegment(fs, p->thread.fsindex);
155 	p->thread.fsbase = p->thread.fsindex ? 0 : me->thread.fsbase;
156 	savesegment(es, p->thread.es);
157 	savesegment(ds, p->thread.ds);
158 	memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
159 
160 	if (unlikely(p->flags & PF_KTHREAD)) {
161 		/* kernel thread */
162 		memset(childregs, 0, sizeof(struct pt_regs));
163 		childregs->sp = (unsigned long)childregs;
164 		childregs->ss = __KERNEL_DS;
165 		childregs->bx = sp; /* function */
166 		childregs->bp = arg;
167 		childregs->orig_ax = -1;
168 		childregs->cs = __KERNEL_CS | get_kernel_rpl();
169 		childregs->flags = X86_EFLAGS_IF | X86_EFLAGS_FIXED;
170 		return 0;
171 	}
172 	*childregs = *current_pt_regs();
173 
174 	childregs->ax = 0;
175 	if (sp)
176 		childregs->sp = sp;
177 
178 	err = -ENOMEM;
179 	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
180 		p->thread.io_bitmap_ptr = kmemdup(me->thread.io_bitmap_ptr,
181 						  IO_BITMAP_BYTES, GFP_KERNEL);
182 		if (!p->thread.io_bitmap_ptr) {
183 			p->thread.io_bitmap_max = 0;
184 			return -ENOMEM;
185 		}
186 		set_tsk_thread_flag(p, TIF_IO_BITMAP);
187 	}
188 
189 	/*
190 	 * Set a new TLS for the child thread?
191 	 */
192 	if (clone_flags & CLONE_SETTLS) {
193 #ifdef CONFIG_IA32_EMULATION
194 		if (in_ia32_syscall())
195 			err = do_set_thread_area(p, -1,
196 				(struct user_desc __user *)tls, 0);
197 		else
198 #endif
199 			err = do_arch_prctl(p, ARCH_SET_FS, tls);
200 		if (err)
201 			goto out;
202 	}
203 	err = 0;
204 out:
205 	if (err && p->thread.io_bitmap_ptr) {
206 		kfree(p->thread.io_bitmap_ptr);
207 		p->thread.io_bitmap_max = 0;
208 	}
209 
210 	return err;
211 }
212 
213 static void
214 start_thread_common(struct pt_regs *regs, unsigned long new_ip,
215 		    unsigned long new_sp,
216 		    unsigned int _cs, unsigned int _ss, unsigned int _ds)
217 {
218 	loadsegment(fs, 0);
219 	loadsegment(es, _ds);
220 	loadsegment(ds, _ds);
221 	load_gs_index(0);
222 	regs->ip		= new_ip;
223 	regs->sp		= new_sp;
224 	regs->cs		= _cs;
225 	regs->ss		= _ss;
226 	regs->flags		= X86_EFLAGS_IF;
227 	force_iret();
228 }
229 
230 void
231 start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
232 {
233 	start_thread_common(regs, new_ip, new_sp,
234 			    __USER_CS, __USER_DS, 0);
235 }
236 
#ifdef CONFIG_COMPAT
/*
 * Compat counterpart of start_thread(): DS, ES and SS all use __USER_DS.
 * The code selector is __USER_CS when the thread has TIF_X32 set and
 * __USER32_CS otherwise.
 */
void compat_start_thread(struct pt_regs *regs, u32 new_ip, u32 new_sp)
{
	unsigned int code_sel = __USER32_CS;

	if (test_thread_flag(TIF_X32))
		code_sel = __USER_CS;

	start_thread_common(regs, new_ip, new_sp,
			    code_sel, __USER_DS, __USER_DS);
}
#endif
246 
/*
 *	switch_to(x,y) should switch tasks from x to y.
 *
 * @prev_p: the task being switched away from; its segment state is saved
 *          into prev_p->thread here.
 * @next_p: the task being switched to.
 *
 * Returns @prev_p.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes are not supported here. Set the probe on schedule instead.
 * The function graph tracer is not supported either.
 */
__visible __notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
	struct thread_struct *prev = &prev_p->thread;
	struct thread_struct *next = &next_p->thread;
	struct fpu *prev_fpu = &prev->fpu;
	struct fpu *next_fpu = &next->fpu;
	int cpu = smp_processor_id();
	struct tss_struct *tss = &per_cpu(cpu_tss, cpu);
	unsigned prev_fsindex, prev_gsindex;
	fpu_switch_t fpu_switch;

	/* First half of the FPU switch; switch_fpu_finish() below completes it. */
	fpu_switch = switch_fpu_prepare(prev_fpu, next_fpu, cpu);

	/* We must save %fs and %gs before load_TLS() because
	 * %fs and %gs may be cleared by load_TLS().
	 *
	 * (e.g. xen_load_tls())
	 */
	savesegment(fs, prev_fsindex);
	savesegment(gs, prev_gsindex);

	/*
	 * Load TLS before restoring any segments so that segment loads
	 * reference the correct GDT entries.
	 */
	load_TLS(next, cpu);

	/*
	 * Leave lazy mode, flushing any hypercalls made here.  This
	 * must be done after loading TLS entries in the GDT but before
	 * loading segments that might reference them, and it must
	 * be done before fpu__restore(), so the TS bit is up to
	 * date.
	 */
	arch_end_context_switch(next_p);

	/* Switch DS and ES.
	 *
	 * Reading them only returns the selectors, but writing them (if
	 * nonzero) loads the full descriptor from the GDT or LDT.  The
	 * LDT for next is loaded in switch_mm, and the GDT is loaded
	 * above.
	 *
	 * We therefore need to write new values to the segment
	 * registers on every context switch unless both the new and old
	 * values are zero.
	 *
	 * Note that we don't need to do anything for CS and SS, as
	 * those are saved and restored as part of pt_regs.
	 */
	savesegment(es, prev->es);
	if (unlikely(next->es | prev->es))
		loadsegment(es, next->es);

	savesegment(ds, prev->ds);
	if (unlikely(next->ds | prev->ds))
		loadsegment(ds, next->ds);

	/*
	 * Switch FS and GS.
	 *
	 * These are even more complicated than DS and ES: they have
	 * 64-bit bases that are controlled by arch_prctl.  The bases
	 * don't necessarily match the selectors, as user code can do
	 * any number of things to cause them to be inconsistent.
	 *
	 * We don't promise to preserve the bases if the selectors are
	 * nonzero.  We also don't promise to preserve the base if the
	 * selector is zero and the base doesn't match whatever was
	 * most recently passed to ARCH_SET_FS/GS.  (If/when the
	 * FSGSBASE instructions are enabled, we'll need to offer
	 * stronger guarantees.)
	 *
	 * As an invariant,
	 * (fsbase != 0 && fsindex != 0) || (gsbase != 0 && gsindex != 0) is
	 * impossible.
	 */
	if (next->fsindex) {
		/* Loading a nonzero value into FS sets the index and base. */
		loadsegment(fs, next->fsindex);
	} else {
		if (next->fsbase) {
			/* Next index is zero but next base is nonzero. */
			if (prev_fsindex)
				loadsegment(fs, 0);
			wrmsrl(MSR_FS_BASE, next->fsbase);
		} else {
			/* Next base and index are both zero. */
			if (static_cpu_has_bug(X86_BUG_NULL_SEG)) {
				/*
				 * We don't know the previous base and can't
				 * find out without RDMSR.  Forcibly clear it.
				 */
				loadsegment(fs, __USER_DS);
				loadsegment(fs, 0);
			} else {
				/*
				 * If the previous index is zero and ARCH_SET_FS
				 * didn't change the base, then the base is
				 * also zero and we don't need to do anything.
				 */
				if (prev->fsbase || prev_fsindex)
					loadsegment(fs, 0);
			}
		}
	}
	/*
	 * Save the old state and preserve the invariant.
	 * NB: if prev_fsindex == 0, then we can't reliably learn the base
	 * without RDMSR because Intel user code can zero it without telling
	 * us and AMD user code can program any 32-bit value without telling
	 * us.
	 */
	if (prev_fsindex)
		prev->fsbase = 0;
	prev->fsindex = prev_fsindex;

	/*
	 * Same sequence for GS, using load_gs_index() for the selector and
	 * MSR_KERNEL_GS_BASE for the base.
	 */
	if (next->gsindex) {
		/* Loading a nonzero value into GS sets the index and base. */
		load_gs_index(next->gsindex);
	} else {
		if (next->gsbase) {
			/* Next index is zero but next base is nonzero. */
			if (prev_gsindex)
				load_gs_index(0);
			wrmsrl(MSR_KERNEL_GS_BASE, next->gsbase);
		} else {
			/* Next base and index are both zero. */
			if (static_cpu_has_bug(X86_BUG_NULL_SEG)) {
				/*
				 * We don't know the previous base and can't
				 * find out without RDMSR.  Forcibly clear it.
				 *
				 * This contains a pointless SWAPGS pair.
				 * Fixing it would involve an explicit check
				 * for Xen or a new pvop.
				 */
				load_gs_index(__USER_DS);
				load_gs_index(0);
			} else {
				/*
				 * If the previous index is zero and ARCH_SET_GS
				 * didn't change the base, then the base is
				 * also zero and we don't need to do anything.
				 */
				if (prev->gsbase || prev_gsindex)
					load_gs_index(0);
			}
		}
	}
	/*
	 * Save the old state and preserve the invariant.
	 * NB: if prev_gsindex == 0, then we can't reliably learn the base
	 * without RDMSR because Intel user code can zero it without telling
	 * us and AMD user code can program any 32-bit value without telling
	 * us.
	 */
	if (prev_gsindex)
		prev->gsbase = 0;
	prev->gsindex = prev_gsindex;

	/* Second half of the FPU switch started by switch_fpu_prepare(). */
	switch_fpu_finish(next_fpu, fpu_switch);

	/*
	 * Point this CPU's current_task at the incoming task.
	 */
	this_cpu_write(current_task, next_p);

	/* Reload esp0 and ss1.  This changes current_thread_info(). */
	load_sp0(tss, next);

	/*
	 * Now maybe reload the debug registers and handle I/O bitmaps
	 */
	if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
		     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
		__switch_to_xtra(prev_p, next_p, tss);

#ifdef CONFIG_XEN
	/*
	 * On Xen PV, IOPL bits in pt_regs->flags have no effect, and
	 * current_pt_regs()->flags may not match the current task's
	 * intended IOPL.  We need to switch it manually.
	 */
	if (unlikely(static_cpu_has(X86_FEATURE_XENPV) &&
		     prev->iopl != next->iopl))
		xen_set_iopl_mask(next->iopl);
#endif

	if (static_cpu_has_bug(X86_BUG_SYSRET_SS_ATTRS)) {
		/*
		 * AMD CPUs have a misfeature: SYSRET sets the SS selector but
		 * does not update the cached descriptor.  As a result, if we
		 * do SYSRET while SS is NULL, we'll end up in user mode with
		 * SS apparently equal to __USER_DS but actually unusable.
		 *
		 * The straightforward workaround would be to fix it up just
		 * before SYSRET, but that would slow down the system call
		 * fast paths.  Instead, we ensure that SS is never NULL in
		 * system call context.  We do this by replacing NULL SS
		 * selectors at every context switch.  SYSCALL sets up a valid
		 * SS, so the only way to get NULL is to re-enter the kernel
		 * from CPL 3 through an interrupt.  Since that can't happen
		 * in the same task as a running syscall, we are guaranteed to
		 * context switch between every interrupt vector entry and a
		 * subsequent SYSRET.
		 *
		 * We read SS first because SS reads are much faster than
		 * writes.  Out of caution, we force SS to __KERNEL_DS even if
		 * it previously had a different non-NULL value.
		 */
		unsigned short ss_sel;
		savesegment(ss, ss_sel);
		if (ss_sel != __KERNEL_DS)
			loadsegment(ss, __KERNEL_DS);
	}

	return prev_p;
}
478 void set_personality_64bit(void)
479 {
480 	/* inherit personality from parent */
481 
482 	/* Make sure to be in 64bit mode */
483 	clear_thread_flag(TIF_IA32);
484 	clear_thread_flag(TIF_ADDR32);
485 	clear_thread_flag(TIF_X32);
486 
487 	/* Ensure the corresponding mm is not marked. */
488 	if (current->mm)
489 		current->mm->context.ia32_compat = 0;
490 
491 	/* TBD: overwrites user setup. Should have two bits.
492 	   But 64bit processes have always behaved this way,
493 	   so it's not too bad. The main problem is just that
494 	   32bit childs are affected again. */
495 	current->personality &= ~READ_IMPLIES_EXEC;
496 }
497 
498 void set_personality_ia32(bool x32)
499 {
500 	/* inherit personality from parent */
501 
502 	/* Make sure to be in 32bit mode */
503 	set_thread_flag(TIF_ADDR32);
504 
505 	/* Mark the associated mm as containing 32-bit tasks. */
506 	if (x32) {
507 		clear_thread_flag(TIF_IA32);
508 		set_thread_flag(TIF_X32);
509 		if (current->mm)
510 			current->mm->context.ia32_compat = TIF_X32;
511 		current->personality &= ~READ_IMPLIES_EXEC;
512 		/* in_compat_syscall() uses the presence of the x32
513 		   syscall bit flag to determine compat status */
514 		current_thread_info()->status &= ~TS_COMPAT;
515 	} else {
516 		set_thread_flag(TIF_IA32);
517 		clear_thread_flag(TIF_X32);
518 		if (current->mm)
519 			current->mm->context.ia32_compat = TIF_IA32;
520 		current->personality |= force_personality32;
521 		/* Prepare the first "return" to user space */
522 		current_thread_info()->status |= TS_COMPAT;
523 	}
524 }
525 EXPORT_SYMBOL_GPL(set_personality_ia32);
526 
527 long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
528 {
529 	int ret = 0;
530 	int doit = task == current;
531 	int cpu;
532 
533 	switch (code) {
534 	case ARCH_SET_GS:
535 		if (addr >= TASK_SIZE_MAX)
536 			return -EPERM;
537 		cpu = get_cpu();
538 		task->thread.gsindex = 0;
539 		task->thread.gsbase = addr;
540 		if (doit) {
541 			load_gs_index(0);
542 			ret = wrmsrl_safe(MSR_KERNEL_GS_BASE, addr);
543 		}
544 		put_cpu();
545 		break;
546 	case ARCH_SET_FS:
547 		/* Not strictly needed for fs, but do it for symmetry
548 		   with gs */
549 		if (addr >= TASK_SIZE_MAX)
550 			return -EPERM;
551 		cpu = get_cpu();
552 		task->thread.fsindex = 0;
553 		task->thread.fsbase = addr;
554 		if (doit) {
555 			/* set the selector to 0 to not confuse __switch_to */
556 			loadsegment(fs, 0);
557 			ret = wrmsrl_safe(MSR_FS_BASE, addr);
558 		}
559 		put_cpu();
560 		break;
561 	case ARCH_GET_FS: {
562 		unsigned long base;
563 		if (doit)
564 			rdmsrl(MSR_FS_BASE, base);
565 		else
566 			base = task->thread.fsbase;
567 		ret = put_user(base, (unsigned long __user *)addr);
568 		break;
569 	}
570 	case ARCH_GET_GS: {
571 		unsigned long base;
572 		if (doit)
573 			rdmsrl(MSR_KERNEL_GS_BASE, base);
574 		else
575 			base = task->thread.gsbase;
576 		ret = put_user(base, (unsigned long __user *)addr);
577 		break;
578 	}
579 
580 	default:
581 		ret = -EINVAL;
582 		break;
583 	}
584 
585 	return ret;
586 }
587 
588 long sys_arch_prctl(int code, unsigned long addr)
589 {
590 	return do_arch_prctl(current, code, addr);
591 }
592 
593 unsigned long KSTK_ESP(struct task_struct *task)
594 {
595 	return task_pt_regs(task)->sp;
596 }
597