/*
 *  Copyright (C) 1995  Linus Torvalds
 *
 *  Pentium III FXSR, SSE support
 *	Gareth Hughes <gareth@valinux.com>, May 2000
 *
 *  X86-64 port
 *	Andi Kleen.
 *
 *	CPU hotplug support - ashok.raj@intel.com
 */

/*
 * This file handles the architecture-dependent parts of process handling.
 */

#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/interrupt.h>
#include <linux/delay.h>
#include <linux/module.h>
#include <linux/ptrace.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/prctl.h>
#include <linux/uaccess.h>
#include <linux/io.h>
#include <linux/ftrace.h>

#include <asm/pgtable.h>
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/fpu-internal.h>
#include <asm/mmu_context.h>
#include <asm/prctl.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/ia32.h>
#include <asm/idle.h>
#include <asm/syscalls.h>
#include <asm/debugreg.h>
#include <asm/switch_to.h>

asmlinkage extern void ret_from_fork(void);

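/*
 * Per-CPU stash for the user %rsp. The 64-bit syscall entry path saves
 * the user stack pointer here on kernel entry; start_thread_common()
 * and __switch_to() below keep it in sync with thread.usersp.
 */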
DEFINE_PER_CPU(unsigned long, old_rsp);

/* Also prints some state that isn't saved in pt_regs. */
void __show_regs(struct pt_regs *regs, int all)
{
	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
	unsigned long d0, d1, d2, d3, d6, d7;
	unsigned int fsindex, gsindex;
	unsigned int ds, cs, es;

	show_regs_common();
	printk(KERN_DEFAULT "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
	printk_address(regs->ip, 1);
	printk(KERN_DEFAULT "RSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss,
			regs->sp, regs->flags);
	printk(KERN_DEFAULT "RAX: %016lx RBX: %016lx RCX: %016lx\n",
	       regs->ax, regs->bx, regs->cx);
	printk(KERN_DEFAULT "RDX: %016lx RSI: %016lx RDI: %016lx\n",
	       regs->dx, regs->si, regs->di);
	printk(KERN_DEFAULT "RBP: %016lx R08: %016lx R09: %016lx\n",
	       regs->bp, regs->r8, regs->r9);
	printk(KERN_DEFAULT "R10: %016lx R11: %016lx R12: %016lx\n",
	       regs->r10, regs->r11, regs->r12);
	printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n",
	       regs->r13, regs->r14, regs->r15);

	asm("movl %%ds,%0" : "=r" (ds));
	asm("movl %%cs,%0" : "=r" (cs));
	asm("movl %%es,%0" : "=r" (es));
	asm("movl %%fs,%0" : "=r" (fsindex));
	asm("movl %%gs,%0" : "=r" (gsindex));

	rdmsrl(MSR_FS_BASE, fs);
	rdmsrl(MSR_GS_BASE, gs);
	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

	if (!all)
		return;

	cr0 = read_cr0();
	cr2 = read_cr2();
	cr3 = read_cr3();
	cr4 = read_cr4();

	printk(KERN_DEFAULT "FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
	       fs, fsindex, gs, gsindex, shadowgs);
	printk(KERN_DEFAULT "CS:  %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
			es, cr0);
	printk(KERN_DEFAULT "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
			cr4);

	get_debugreg(d0, 0);
	get_debugreg(d1, 1);
	get_debugreg(d2, 2);
	printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
	get_debugreg(d3, 3);
	get_debugreg(d6, 6);
	get_debugreg(d7, 7);
	printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
}

void release_thread(struct task_struct *dead_task)
{
	if (dead_task->mm) {
		if (dead_task->mm->context.size) {
			pr_warn("WARNING: dead process %8s still has LDT? <%p/%d>\n",
				dead_task->comm,
				dead_task->mm->context.ldt,
				dead_task->mm->context.size);
			BUG();
		}
	}
}

static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
{
	struct user_desc ud = {
		.base_addr = addr,
		.limit = 0xfffff,
		.seg_32bit = 1,
		.limit_in_pages = 1,
		.useable = 1,
	};
	struct desc_struct *desc = t->thread.tls_array;
	desc += tls;
	fill_ldt(desc, &ud);
}

static inline u32 read_32bit_tls(struct task_struct *t, int tls)
{
	return get_desc_base(&t->thread.tls_array[tls]);
}
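
/*
 * The helpers above round-trip a small (32-bit) TLS base through a GDT
 * descriptor slot. A minimal sketch (illustrative only; FS_TLS is the
 * slot used by do_arch_prctl() below):
 *
 *	set_32bit_tls(task, FS_TLS, base);
 *	WARN_ON(read_32bit_tls(task, FS_TLS) != base);
 */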
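
/*
 * Set up a new child's thread state. Kernel threads (PF_KTHREAD) get a
 * register frame from which ret_from_fork calls the function in bx with
 * the argument in bp; user children get a copy of the parent's registers
 * with ax cleared, so fork() returns 0 in the child.
 */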
int copy_thread(unsigned long clone_flags, unsigned long sp,
		unsigned long arg, struct task_struct *p)
{
	int err;
	struct pt_regs *childregs;
	struct task_struct *me = current;

	p->thread.sp0 = (unsigned long)task_stack_page(p) + THREAD_SIZE;
	childregs = task_pt_regs(p);
	p->thread.sp = (unsigned long) childregs;
	p->thread.usersp = me->thread.usersp;
	set_tsk_thread_flag(p, TIF_FORK);
	p->fpu_counter = 0;
	p->thread.io_bitmap_ptr = NULL;

	savesegment(gs, p->thread.gsindex);
	p->thread.gs = p->thread.gsindex ? 0 : me->thread.gs;
	savesegment(fs, p->thread.fsindex);
	p->thread.fs = p->thread.fsindex ? 0 : me->thread.fs;
	savesegment(es, p->thread.es);
	savesegment(ds, p->thread.ds);
	memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));

	if (unlikely(p->flags & PF_KTHREAD)) {
		/* kernel thread */
		memset(childregs, 0, sizeof(struct pt_regs));
		childregs->sp = (unsigned long)childregs;
		childregs->ss = __KERNEL_DS;
		childregs->bx = sp; /* function */
		childregs->bp = arg;
		childregs->orig_ax = -1;
		childregs->cs = __KERNEL_CS | get_kernel_rpl();
		childregs->flags = X86_EFLAGS_IF | X86_EFLAGS_BIT1;
		return 0;
	}
	*childregs = *current_pt_regs();

	childregs->ax = 0;
	if (sp)
		childregs->sp = sp;

	err = -ENOMEM;

	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
		p->thread.io_bitmap_ptr = kmemdup(me->thread.io_bitmap_ptr,
						  IO_BITMAP_BYTES, GFP_KERNEL);
		if (!p->thread.io_bitmap_ptr) {
			p->thread.io_bitmap_max = 0;
			return -ENOMEM;
		}
		set_tsk_thread_flag(p, TIF_IO_BITMAP);
	}

	/*
	 * Set a new TLS for the child thread?
	 */
	if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
		if (test_thread_flag(TIF_IA32))
			err = do_set_thread_area(p, -1,
				(struct user_desc __user *)childregs->si, 0);
		else
#endif
			err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
		if (err)
			goto out;
	}
	err = 0;
out:
	if (err && p->thread.io_bitmap_ptr) {
		kfree(p->thread.io_bitmap_ptr);
		p->thread.io_bitmap_max = 0;
	}

	return err;
}

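/*
 * Reset the segment registers and prime the pt_regs frame so that the
 * first return to user mode lands at new_ip with its stack at new_sp.
 */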
static void
start_thread_common(struct pt_regs *regs, unsigned long new_ip,
		    unsigned long new_sp,
		    unsigned int _cs, unsigned int _ss, unsigned int _ds)
{
	loadsegment(fs, 0);
	loadsegment(es, _ds);
	loadsegment(ds, _ds);
	load_gs_index(0);
	current->thread.usersp	= new_sp;
	regs->ip		= new_ip;
	regs->sp		= new_sp;
	this_cpu_write(old_rsp, new_sp);
	regs->cs		= _cs;
	regs->ss		= _ss;
	regs->flags		= X86_EFLAGS_IF;
}

void
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
	start_thread_common(regs, new_ip, new_sp,
			    __USER_CS, __USER_DS, 0);
}

#ifdef CONFIG_IA32_EMULATION
void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp)
{
	start_thread_common(regs, new_ip, new_sp,
			    test_thread_flag(TIF_X32)
			    ? __USER_CS : __USER32_CS,
			    __USER_DS, __USER_DS);
}
#endif
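
/*
 * start_thread() is invoked by the binfmt loaders once a new image is
 * set up. A sketch of typical use (cf. load_elf_binary() in
 * fs/binfmt_elf.c):
 *
 *	start_thread(regs, elf_entry, bprm->p);
 */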

/*
 *	switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes are not supported here. Set the probe on schedule instead.
 * The function graph tracer is not supported either.
 */
__notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
	struct thread_struct *prev = &prev_p->thread;
	struct thread_struct *next = &next_p->thread;
	int cpu = smp_processor_id();
	struct tss_struct *tss = &per_cpu(init_tss, cpu);
	unsigned fsindex, gsindex;
	fpu_switch_t fpu;

	fpu = switch_fpu_prepare(prev_p, next_p, cpu);

	/*
	 * Reload esp0, LDT and the page table pointer:
	 */
	load_sp0(tss, next);

	/*
	 * Switch DS and ES.
	 * This won't pick up thread selector changes, but I guess that is ok.
	 */
	savesegment(es, prev->es);
	if (unlikely(next->es | prev->es))
		loadsegment(es, next->es);

	savesegment(ds, prev->ds);
	if (unlikely(next->ds | prev->ds))
		loadsegment(ds, next->ds);


	/* We must save %fs and %gs before load_TLS() because
	 * %fs and %gs may be cleared by load_TLS().
	 *
	 * (e.g. xen_load_tls())
	 */
	savesegment(fs, fsindex);
	savesegment(gs, gsindex);

	load_TLS(next, cpu);

	/*
	 * Leave lazy mode, flushing any hypercalls made here.
	 * This must be done before restoring TLS segments so
	 * the GDT and LDT are properly updated, and must be
	 * done before math_state_restore, so the TS bit is up
	 * to date.
	 */
	arch_end_context_switch(next_p);

	/*
	 * Switch FS and GS.
	 *
	 * A nonzero segment register always requires a reload. Also
	 * reload when it has changed. When the previous process used a
	 * 64-bit base, always reload to avoid an information leak.
	 */
	if (unlikely(fsindex | next->fsindex | prev->fs)) {
		loadsegment(fs, next->fsindex);
		/*
		 * Check if the user used a selector != 0; if so,
		 * clear the 64-bit base, since the overloaded base is
		 * always mapped to the null selector.
		 */
		if (fsindex)
			prev->fs = 0;
	}
	/* when next process has a 64bit base use it */
	if (next->fs)
		wrmsrl(MSR_FS_BASE, next->fs);
	prev->fsindex = fsindex;

	if (unlikely(gsindex | next->gsindex | prev->gs)) {
		load_gs_index(next->gsindex);
		if (gsindex)
			prev->gs = 0;
	}
	if (next->gs)
		wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
	prev->gsindex = gsindex;

	switch_fpu_finish(next_p, fpu);

	/*
	 * Switch the PDA and FPU contexts.
	 */
	prev->usersp = this_cpu_read(old_rsp);
	this_cpu_write(old_rsp, next->usersp);
	this_cpu_write(current_task, next_p);

	this_cpu_write(kernel_stack,
		  (unsigned long)task_stack_page(next_p) +
		  THREAD_SIZE - KERNEL_STACK_OFFSET);

	/*
	 * Now maybe reload the debug registers and handle I/O bitmaps
	 */
	if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
		     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
		__switch_to_xtra(prev_p, next_p, tss);

	return prev_p;
}

void set_personality_64bit(void)
{
	/* inherit personality from parent */

	/* Make sure to be in 64bit mode */
	clear_thread_flag(TIF_IA32);
	clear_thread_flag(TIF_ADDR32);
	clear_thread_flag(TIF_X32);

	/* Ensure the corresponding mm is not marked. */
	if (current->mm)
		current->mm->context.ia32_compat = 0;

	/* TBD: overwrites user setup. Should have two bits.
	   But 64bit processes have always behaved this way,
	   so it's not too bad. The main problem is just that
	   32bit children are affected again. */
	current->personality &= ~READ_IMPLIES_EXEC;
}

void set_personality_ia32(bool x32)
{
	/* inherit personality from parent */

	/* Make sure to be in 32bit mode */
	set_thread_flag(TIF_ADDR32);

	/* Mark the associated mm as containing 32-bit tasks. */
	if (current->mm)
		current->mm->context.ia32_compat = 1;

	if (x32) {
		clear_thread_flag(TIF_IA32);
		set_thread_flag(TIF_X32);
		current->personality &= ~READ_IMPLIES_EXEC;
		/* is_compat_task() uses the presence of the x32
		   syscall bit flag to determine compat status */
		current_thread_info()->status &= ~TS_COMPAT;
	} else {
		set_thread_flag(TIF_IA32);
		clear_thread_flag(TIF_X32);
		current->personality |= force_personality32;
		/* Prepare the first "return" to user space */
		current_thread_info()->status |= TS_COMPAT;
	}
}
EXPORT_SYMBOL_GPL(set_personality_ia32);

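/*
 * get_wchan() walks the frame-pointer chain of a sleeping task (only
 * meaningful on frame-pointer builds): the word at thread.sp holds the
 * task's saved %rbp; each frame stores the caller's %rbp at fp and the
 * return address at fp+8. The first return address outside the
 * scheduler is reported as the wait channel.
 */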
unsigned long get_wchan(struct task_struct *p)
{
	unsigned long stack;
	u64 fp, ip;
	int count = 0;

	if (!p || p == current || p->state == TASK_RUNNING)
		return 0;
	stack = (unsigned long)task_stack_page(p);
	if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE)
		return 0;
	fp = *(u64 *)(p->thread.sp);
	do {
		if (fp < (unsigned long)stack ||
		    fp >= (unsigned long)stack+THREAD_SIZE)
			return 0;
		ip = *(u64 *)(fp+8);
		if (!in_sched_functions(ip))
			return ip;
		fp = *(u64 *)fp;
	} while (count++ < 16);
	return 0;
}

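/*
 * arch_prctl() backend: get/set the FS and GS base registers. Bases
 * that fit in 32 bits go through a GDT TLS slot (cheaper to switch);
 * larger bases are written to the FS/GS base MSRs directly, with the
 * selector cleared to 0 so __switch_to() treats the MSR value as
 * authoritative.
 */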
long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
	int ret = 0;
	int doit = task == current;
	int cpu;

	switch (code) {
	case ARCH_SET_GS:
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, GS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				load_gs_index(GS_TLS_SEL);
			}
			task->thread.gsindex = GS_TLS_SEL;
			task->thread.gs = 0;
		} else {
			task->thread.gsindex = 0;
			task->thread.gs = addr;
			if (doit) {
				load_gs_index(0);
				ret = wrmsrl_safe(MSR_KERNEL_GS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_SET_FS:
		/* Not strictly needed for fs, but do it for symmetry
		   with gs */
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, FS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				loadsegment(fs, FS_TLS_SEL);
			}
			task->thread.fsindex = FS_TLS_SEL;
			task->thread.fs = 0;
		} else {
			task->thread.fsindex = 0;
			task->thread.fs = addr;
			if (doit) {
				/* set the selector to 0 to not confuse
				   __switch_to */
				loadsegment(fs, 0);
				ret = wrmsrl_safe(MSR_FS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_GET_FS: {
		unsigned long base;
		if (task->thread.fsindex == FS_TLS_SEL)
			base = read_32bit_tls(task, FS_TLS);
		else if (doit)
			rdmsrl(MSR_FS_BASE, base);
		else
			base = task->thread.fs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}
	case ARCH_GET_GS: {
		unsigned long base;
		unsigned gsindex;
		if (task->thread.gsindex == GS_TLS_SEL)
			base = read_32bit_tls(task, GS_TLS);
		else if (doit) {
			savesegment(gs, gsindex);
			if (gsindex)
				rdmsrl(MSR_KERNEL_GS_BASE, base);
			else
				base = task->thread.gs;
		} else
			base = task->thread.gs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}

	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}

long sys_arch_prctl(int code, unsigned long addr)
{
	return do_arch_prctl(current, code, addr);
}
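
/*
 * Illustrative userspace usage of the syscall above (a sketch; error
 * handling omitted):
 *
 *	#include <asm/prctl.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	unsigned long base;
 *	syscall(SYS_arch_prctl, ARCH_SET_GS, 0xdeadbeef000UL);
 *	syscall(SYS_arch_prctl, ARCH_GET_GS, (unsigned long)&base);
 */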

unsigned long KSTK_ESP(struct task_struct *task)
{
	return (test_tsk_thread_flag(task, TIF_IA32)) ?
			(task_pt_regs(task)->sp) : ((task)->thread.usersp);
}
551