xref: /linux/arch/x86/kernel/process_64.c (revision 4413e16d9d21673bb5048a2e542f1aaa00015c2e)
/*
 *  Copyright (C) 1995  Linus Torvalds
 *
 *  Pentium III FXSR, SSE support
 *	Gareth Hughes <gareth@valinux.com>, May 2000
 *
 *  X86-64 port
 *	Andi Kleen.
 *
 *	CPU hotplug support - ashok.raj@intel.com
 */

/*
 * This file handles the architecture-dependent parts of process handling.
 */

#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/interrupt.h>
#include <linux/delay.h>
#include <linux/module.h>
#include <linux/ptrace.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/prctl.h>
#include <linux/uaccess.h>
#include <linux/io.h>
#include <linux/ftrace.h>

#include <asm/pgtable.h>
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/fpu-internal.h>
#include <asm/mmu_context.h>
#include <asm/prctl.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/ia32.h>
#include <asm/idle.h>
#include <asm/syscalls.h>
#include <asm/debugreg.h>
#include <asm/switch_to.h>

asmlinkage extern void ret_from_fork(void);

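/*
 * Per-CPU cache of the user stack pointer: the 64-bit SYSCALL entry path
 * parks the user RSP here, and __switch_to() keeps it in sync with
 * thread.usersp across context switches.
 */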
DEFINE_PER_CPU(unsigned long, old_rsp);

/* Also prints some state that isn't saved in pt_regs */
void __show_regs(struct pt_regs *regs, int all)
{
	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
	unsigned long d0, d1, d2, d3, d6, d7;
	unsigned int fsindex, gsindex;
	unsigned int ds, cs, es;

	show_regs_common();
	printk(KERN_DEFAULT "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
	printk_address(regs->ip, 1);
	printk(KERN_DEFAULT "RSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss,
			regs->sp, regs->flags);
	printk(KERN_DEFAULT "RAX: %016lx RBX: %016lx RCX: %016lx\n",
	       regs->ax, regs->bx, regs->cx);
	printk(KERN_DEFAULT "RDX: %016lx RSI: %016lx RDI: %016lx\n",
	       regs->dx, regs->si, regs->di);
	printk(KERN_DEFAULT "RBP: %016lx R08: %016lx R09: %016lx\n",
	       regs->bp, regs->r8, regs->r9);
	printk(KERN_DEFAULT "R10: %016lx R11: %016lx R12: %016lx\n",
	       regs->r10, regs->r11, regs->r12);
	printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n",
	       regs->r13, regs->r14, regs->r15);

	asm("movl %%ds,%0" : "=r" (ds));
	asm("movl %%cs,%0" : "=r" (cs));
	asm("movl %%es,%0" : "=r" (es));
	asm("movl %%fs,%0" : "=r" (fsindex));
	asm("movl %%gs,%0" : "=r" (gsindex));

	rdmsrl(MSR_FS_BASE, fs);
	rdmsrl(MSR_GS_BASE, gs);
	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

	if (!all)
		return;

	cr0 = read_cr0();
	cr2 = read_cr2();
	cr3 = read_cr3();
	cr4 = read_cr4();

	printk(KERN_DEFAULT "FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
	       fs, fsindex, gs, gsindex, shadowgs);
	printk(KERN_DEFAULT "CS:  %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
			es, cr0);
	printk(KERN_DEFAULT "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
			cr4);

	get_debugreg(d0, 0);
	get_debugreg(d1, 1);
	get_debugreg(d2, 2);
	printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
	get_debugreg(d3, 3);
	get_debugreg(d6, 6);
	get_debugreg(d7, 7);
	printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
}

void release_thread(struct task_struct *dead_task)
{
	if (dead_task->mm) {
		if (dead_task->mm->context.size) {
			pr_warn("WARNING: dead process %8s still has LDT? <%p/%d>\n",
				dead_task->comm,
				dead_task->mm->context.ldt,
				dead_task->mm->context.size);
			BUG();
		}
	}
}

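/*
 * Helpers for the small-base optimization in do_arch_prctl(): an FS/GS
 * base that fits in 32 bits is installed as a GDT TLS segment (the
 * FS_TLS/GS_TLS slots) rather than written to the base MSRs, because
 * reloading a selector on context switch is cheaper than a wrmsr.
 */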
static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
{
	struct user_desc ud = {
		.base_addr = addr,
		.limit = 0xfffff,
		.seg_32bit = 1,
		.limit_in_pages = 1,
		.useable = 1,
	};
	struct desc_struct *desc = t->thread.tls_array;
	desc += tls;
	fill_ldt(desc, &ud);
}

static inline u32 read_32bit_tls(struct task_struct *t, int tls)
{
	return get_desc_base(&t->thread.tls_array[tls]);
}

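/*
 * Set up the child's thread state for fork()/clone(): the child gets a
 * copy of the parent's pt_regs with ax cleared (so it sees a zero return
 * value), its own copy of the parent's I/O bitmap if there is one, and,
 * for CLONE_SETTLS, a freshly initialized TLS area.
 */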
int copy_thread(unsigned long clone_flags, unsigned long sp,
		unsigned long unused,
	struct task_struct *p, struct pt_regs *regs)
{
	int err;
	struct pt_regs *childregs;
	struct task_struct *me = current;

	childregs = ((struct pt_regs *)
			(THREAD_SIZE + task_stack_page(p))) - 1;
	*childregs = *regs;

	childregs->ax = 0;
	if (user_mode(regs))
		childregs->sp = sp;
	else
		childregs->sp = (unsigned long)childregs;

	p->thread.sp = (unsigned long) childregs;
	p->thread.sp0 = (unsigned long) (childregs+1);
	p->thread.usersp = me->thread.usersp;

	set_tsk_thread_flag(p, TIF_FORK);

	p->fpu_counter = 0;
	p->thread.io_bitmap_ptr = NULL;

	savesegment(gs, p->thread.gsindex);
	p->thread.gs = p->thread.gsindex ? 0 : me->thread.gs;
	savesegment(fs, p->thread.fsindex);
	p->thread.fs = p->thread.fsindex ? 0 : me->thread.fs;
	savesegment(es, p->thread.es);
	savesegment(ds, p->thread.ds);

	err = -ENOMEM;
	memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));

	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
		p->thread.io_bitmap_ptr = kmemdup(me->thread.io_bitmap_ptr,
						  IO_BITMAP_BYTES, GFP_KERNEL);
		if (!p->thread.io_bitmap_ptr) {
			p->thread.io_bitmap_max = 0;
			return -ENOMEM;
		}
		set_tsk_thread_flag(p, TIF_IO_BITMAP);
	}

	/*
	 * Set a new TLS for the child thread?
	 */
	if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
		if (test_thread_flag(TIF_IA32))
			err = do_set_thread_area(p, -1,
				(struct user_desc __user *)childregs->si, 0);
		else
#endif
			err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
		if (err)
			goto out;
	}
	err = 0;
out:
	if (err && p->thread.io_bitmap_ptr) {
		kfree(p->thread.io_bitmap_ptr);
		p->thread.io_bitmap_max = 0;
	}

	return err;
}

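/*
 * Common bring-up for a freshly exec'ed thread: reset the data segment
 * registers, point the register frame at the new entry point and stack,
 * and start out with only IF set in the flags.
 */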
static void
start_thread_common(struct pt_regs *regs, unsigned long new_ip,
		    unsigned long new_sp,
		    unsigned int _cs, unsigned int _ss, unsigned int _ds)
{
	loadsegment(fs, 0);
	loadsegment(es, _ds);
	loadsegment(ds, _ds);
	load_gs_index(0);
	current->thread.usersp	= new_sp;
	regs->ip		= new_ip;
	regs->sp		= new_sp;
	this_cpu_write(old_rsp, new_sp);
	regs->cs		= _cs;
	regs->ss		= _ss;
	regs->flags		= X86_EFLAGS_IF;
}

void
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
	start_thread_common(regs, new_ip, new_sp,
			    __USER_CS, __USER_DS, 0);
}
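
/*
 * The typical caller is a binfmt handler; load_elf_binary(), for
 * instance, finishes by calling start_thread(regs, elf_entry, bprm->p)
 * to kick off the new image.
 */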

#ifdef CONFIG_IA32_EMULATION
void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp)
{
	start_thread_common(regs, new_ip, new_sp,
			    test_thread_flag(TIF_X32)
			    ? __USER_CS : __USER32_CS,
			    __USER_DS, __USER_DS);
}
#endif

/*
 *	switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes are not supported here; set the probe on schedule() instead.
 * The function graph tracer is not supported either.
 */
__notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
	struct thread_struct *prev = &prev_p->thread;
	struct thread_struct *next = &next_p->thread;
	int cpu = smp_processor_id();
	struct tss_struct *tss = &per_cpu(init_tss, cpu);
	unsigned fsindex, gsindex;
	fpu_switch_t fpu;

	fpu = switch_fpu_prepare(prev_p, next_p, cpu);

	/*
	 * Reload sp0, the kernel entry stack pointer for the incoming
	 * task.  (The LDT and page tables were already switched over in
	 * switch_mm().)
	 */
	load_sp0(tss, next);

	/*
	 * Switch DS and ES.
	 * This won't pick up thread selector changes, but I guess that is ok.
	 */
	savesegment(es, prev->es);
	if (unlikely(next->es | prev->es))
		loadsegment(es, next->es);

	savesegment(ds, prev->ds);
	if (unlikely(next->ds | prev->ds))
		loadsegment(ds, next->ds);

	/*
	 * We must save %fs and %gs before load_TLS() because
	 * %fs and %gs may be cleared by load_TLS().
	 *
	 * (e.g. xen_load_tls())
	 */
	savesegment(fs, fsindex);
	savesegment(gs, gsindex);

	load_TLS(next, cpu);

	/*
	 * Leave lazy mode, flushing any hypercalls made here.
	 * This must be done before restoring TLS segments so
	 * the GDT and LDT are properly updated, and must be
	 * done before math_state_restore, so the TS bit is up
	 * to date.
	 */
	arch_end_context_switch(next_p);

	/*
	 * Switch FS and GS.
	 *
	 * A segment register holding a non-zero selector always requires
	 * a reload, as does a change in the selector.  When the previous
	 * task used a 64-bit base, always reload, to avoid leaking the
	 * old base into the next task.
	 */
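	/*
	 * A summary of the reload triggers tested below (a restatement
	 * of the comment above, not new behaviour); GS is handled the
	 * same way further down:
	 *
	 *	old fsindex  next->fsindex  prev->fs   action
	 *	     0            0            0       leave FS alone
	 *	   != 0           -            -       reload (and clear prev->fs)
	 *	     -          != 0           -       reload
	 *	     -            -          != 0      reload to flush the old base
	 */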
	if (unlikely(fsindex | next->fsindex | prev->fs)) {
		loadsegment(fs, next->fsindex);
		/*
		 * Check if the user used a selector != 0; if yes,
		 * clear the 64-bit base, since an overloaded base is
		 * always mapped to the null selector.
		 */
		if (fsindex)
			prev->fs = 0;
	}
	/* when next process has a 64bit base use it */
	if (next->fs)
		wrmsrl(MSR_FS_BASE, next->fs);
	prev->fsindex = fsindex;

	if (unlikely(gsindex | next->gsindex | prev->gs)) {
		load_gs_index(next->gsindex);
		if (gsindex)
			prev->gs = 0;
	}
	if (next->gs)
		wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
	prev->gsindex = gsindex;

	switch_fpu_finish(next_p, fpu);

	/*
	 * Switch the per-CPU state: user stack pointer, current task
	 * and kernel stack.  (These fields used to live in the PDA.)
	 */
	prev->usersp = this_cpu_read(old_rsp);
	this_cpu_write(old_rsp, next->usersp);
	this_cpu_write(current_task, next_p);

	this_cpu_write(kernel_stack,
		  (unsigned long)task_stack_page(next_p) +
		  THREAD_SIZE - KERNEL_STACK_OFFSET);

	/*
	 * Now maybe reload the debug registers and handle I/O bitmaps
	 */
	if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
		     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
		__switch_to_xtra(prev_p, next_p, tss);

	return prev_p;
}

void set_personality_64bit(void)
{
	/* inherit personality from parent */

	/* Make sure to be in 64bit mode */
	clear_thread_flag(TIF_IA32);
	clear_thread_flag(TIF_ADDR32);
	clear_thread_flag(TIF_X32);

	/* Ensure the corresponding mm is not marked. */
	if (current->mm)
		current->mm->context.ia32_compat = 0;

	/*
	 * TBD: This overwrites the user's setup.  It should really use
	 * two bits, but 64-bit processes have always behaved this way,
	 * so it's not too bad.  The main problem is just that 32-bit
	 * children are affected again.
	 */
	current->personality &= ~READ_IMPLIES_EXEC;
}

void set_personality_ia32(bool x32)
{
	/* inherit personality from parent */

	/* Make sure to be in 32bit mode */
	set_thread_flag(TIF_ADDR32);

	/* Mark the associated mm as containing 32-bit tasks. */
	if (current->mm)
		current->mm->context.ia32_compat = 1;

	if (x32) {
		clear_thread_flag(TIF_IA32);
		set_thread_flag(TIF_X32);
		current->personality &= ~READ_IMPLIES_EXEC;
		/*
		 * is_compat_task() uses the presence of the x32 syscall
		 * bit flag to determine compat status.
		 */
		current_thread_info()->status &= ~TS_COMPAT;
	} else {
		set_thread_flag(TIF_IA32);
		clear_thread_flag(TIF_X32);
		current->personality |= force_personality32;
		/* Prepare the first "return" to user space */
		current_thread_info()->status |= TS_COMPAT;
	}
}
EXPORT_SYMBOL_GPL(set_personality_ia32);

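/*
 * get_wchan() reports where a sleeping task is blocked by walking its
 * saved frame pointers; this relies on frames being laid out as
 * [saved rbp][return rip], i.e. on the kernel being built with frame
 * pointers.  The walk starts at thread.sp, where the scheduler left
 * the frame pointer, and stops at the first return address that is
 * not inside the scheduler itself.
 */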
unsigned long get_wchan(struct task_struct *p)
{
	unsigned long stack;
	u64 fp, ip;
	int count = 0;

	if (!p || p == current || p->state == TASK_RUNNING)
		return 0;
	stack = (unsigned long)task_stack_page(p);
	if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE)
		return 0;
	fp = *(u64 *)(p->thread.sp);
	do {
		if (fp < (unsigned long)stack ||
		    fp >= (unsigned long)stack+THREAD_SIZE)
			return 0;
		ip = *(u64 *)(fp+8);
		if (!in_sched_functions(ip))
			return ip;
		fp = *(u64 *)fp;
	} while (count++ < 16);
	return 0;
}

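/*
 * Set or read the FS/GS base of @task.  Bases that fit in 32 bits are
 * installed via a GDT TLS slot (see set_32bit_tls() above), since a
 * selector reload on context switch is cheaper than a wrmsr; larger
 * bases go into MSR_FS_BASE / MSR_KERNEL_GS_BASE.
 */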
long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
	int ret = 0;
	int doit = task == current;
	int cpu;

	switch (code) {
	case ARCH_SET_GS:
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/*
		 * handle small bases via the GDT because that's faster to
		 * switch.
		 */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, GS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				load_gs_index(GS_TLS_SEL);
			}
			task->thread.gsindex = GS_TLS_SEL;
			task->thread.gs = 0;
		} else {
			task->thread.gsindex = 0;
			task->thread.gs = addr;
			if (doit) {
				load_gs_index(0);
				ret = wrmsrl_safe(MSR_KERNEL_GS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_SET_FS:
		/*
		 * Not strictly needed for fs, but do it for symmetry
		 * with gs.
		 */
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/*
		 * handle small bases via the GDT because that's faster to
		 * switch.
		 */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, FS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				loadsegment(fs, FS_TLS_SEL);
			}
			task->thread.fsindex = FS_TLS_SEL;
			task->thread.fs = 0;
		} else {
			task->thread.fsindex = 0;
			task->thread.fs = addr;
			if (doit) {
				/*
				 * set the selector to 0 to not confuse
				 * __switch_to
				 */
				loadsegment(fs, 0);
				ret = wrmsrl_safe(MSR_FS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_GET_FS: {
		unsigned long base;
		if (task->thread.fsindex == FS_TLS_SEL)
			base = read_32bit_tls(task, FS_TLS);
		else if (doit)
			rdmsrl(MSR_FS_BASE, base);
		else
			base = task->thread.fs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}
	case ARCH_GET_GS: {
		unsigned long base;
		unsigned gsindex;
		if (task->thread.gsindex == GS_TLS_SEL)
			base = read_32bit_tls(task, GS_TLS);
		else if (doit) {
			savesegment(gs, gsindex);
			if (gsindex)
				rdmsrl(MSR_KERNEL_GS_BASE, base);
			else
				base = task->thread.gs;
		} else
			base = task->thread.gs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}

	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}

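/*
 * For illustration only (not part of this file): a minimal user-space
 * sketch of driving this entry point through the arch_prctl(2) syscall.
 * The helper name set_fs_base() is made up; SYS_arch_prctl and
 * ARCH_SET_FS are the real constants from <sys/syscall.h> and
 * <asm/prctl.h>.
 *
 *	#include <asm/prctl.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	static long set_fs_base(unsigned long base)
 *	{
 *		// lands in sys_arch_prctl() -> do_arch_prctl() above
 *		return syscall(SYS_arch_prctl, ARCH_SET_FS, base);
 *	}
 *
 * A thread library would point @base at its TCB; the kernel then either
 * installs a GDT TLS slot (small base) or writes MSR_FS_BASE (large base).
 */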
long sys_arch_prctl(int code, unsigned long addr)
{
	return do_arch_prctl(current, code, addr);
}

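/*
 * User stack pointer as reported in e.g. /proc/<pid>/stat: ia32 tasks
 * keep it in pt_regs, while for 64-bit tasks the SYSCALL fast path
 * leaves it in thread.usersp rather than in the pt_regs frame.
 */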
unsigned long KSTK_ESP(struct task_struct *task)
{
	return (test_tsk_thread_flag(task, TIF_IA32)) ?
			(task_pt_regs(task)->sp) : ((task)->thread.usersp);
}