xref: /linux/arch/x86/kernel/process_64.c (revision a0f97e06a43cf524e616f09e6af3398e1e9c1c5b)
1 /*
2  *  linux/arch/x86/kernel/process_64.c
3  *
4  *  Copyright (C) 1995  Linus Torvalds
5  *
6  *  Pentium III FXSR, SSE support
7  *	Gareth Hughes <gareth@valinux.com>, May 2000
8  *
9  *  X86-64 port
10  *	Andi Kleen.
11  *
12  *	CPU hotplug support - ashok.raj@intel.com
13  */
14 
15 /*
16  * This file handles the architecture-dependent parts of process handling.
17  */
18 
19 #include <stdarg.h>
20 
21 #include <linux/cpu.h>
22 #include <linux/errno.h>
23 #include <linux/sched.h>
24 #include <linux/kernel.h>
25 #include <linux/mm.h>
26 #include <linux/fs.h>
27 #include <linux/elfcore.h>
28 #include <linux/smp.h>
29 #include <linux/slab.h>
30 #include <linux/user.h>
31 #include <linux/module.h>
32 #include <linux/a.out.h>
33 #include <linux/interrupt.h>
34 #include <linux/delay.h>
35 #include <linux/ptrace.h>
36 #include <linux/utsname.h>
37 #include <linux/random.h>
38 #include <linux/notifier.h>
39 #include <linux/kprobes.h>
40 #include <linux/kdebug.h>
41 
42 #include <asm/uaccess.h>
43 #include <asm/pgtable.h>
44 #include <asm/system.h>
45 #include <asm/io.h>
46 #include <asm/processor.h>
47 #include <asm/i387.h>
48 #include <asm/mmu_context.h>
49 #include <asm/pda.h>
50 #include <asm/prctl.h>
51 #include <asm/desc.h>
52 #include <asm/proto.h>
53 #include <asm/ia32.h>
54 #include <asm/idle.h>
55 
56 asmlinkage extern void ret_from_fork(void);
57 
58 unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;
59 
60 unsigned long boot_option_idle_override = 0;
61 EXPORT_SYMBOL(boot_option_idle_override);
62 
63 /*
64  * Power management idle function, if any.
65  */
66 void (*pm_idle)(void);
67 EXPORT_SYMBOL(pm_idle);
68 static DEFINE_PER_CPU(unsigned int, cpu_idle_state);
69 
70 static ATOMIC_NOTIFIER_HEAD(idle_notifier);
71 
72 void idle_notifier_register(struct notifier_block *n)
73 {
74 	atomic_notifier_chain_register(&idle_notifier, n);
75 }
76 EXPORT_SYMBOL_GPL(idle_notifier_register);
77 
78 void idle_notifier_unregister(struct notifier_block *n)
79 {
80 	atomic_notifier_chain_unregister(&idle_notifier, n);
81 }
82 EXPORT_SYMBOL(idle_notifier_unregister);
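
/*
 * Editor's sketch (not part of this file): a minimal, hypothetical user of
 * the idle notifier API above. A driver that wants to react to the CPU
 * entering and leaving idle could register a notifier_block like this;
 * the names example_idle_notify/example_idle_nb are illustrative only.
 */
#if 0	/* illustrative only */
static int example_idle_notify(struct notifier_block *nb,
			       unsigned long action, void *data)
{
	switch (action) {
	case IDLE_START:
		/* the CPU is about to idle: e.g. gate a device clock */
		break;
	case IDLE_END:
		/* the CPU left idle: undo whatever IDLE_START did */
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block example_idle_nb = {
	.notifier_call	= example_idle_notify,
};

static int __init example_idle_init(void)
{
	idle_notifier_register(&example_idle_nb);
	return 0;
}
#endif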
83 
84 void enter_idle(void)
85 {
86 	write_pda(isidle, 1);
87 	atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
88 }
89 
90 static void __exit_idle(void)
91 {
92 	if (test_and_clear_bit_pda(0, isidle) == 0)
93 		return;
94 	atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
95 }
96 
97 /* Called from interrupts to signify idle end */
98 void exit_idle(void)
99 {
100 	/* idle loop has pid 0 */
101 	if (current->pid)
102 		return;
103 	__exit_idle();
104 }
105 
106 /*
107  * We use this if we don't have any better
108  * idle routine.
109  */
110 static void default_idle(void)
111 {
112 	current_thread_info()->status &= ~TS_POLLING;
113 	/*
114 	 * TS_POLLING-cleared state must be visible before we
115 	 * test NEED_RESCHED:
116 	 */
117 	smp_mb();
118 	local_irq_disable();
119 	if (!need_resched()) {
120 		/* Enables interrupts one instruction before HLT.
121 		   x86 special cases this so there is no race. */
122 		safe_halt();
123 	} else
124 		local_irq_enable();
125 	current_thread_info()->status |= TS_POLLING;
126 }
127 
128 /*
129  * On SMP it's slightly faster (but much more power-consuming!)
130  * to poll the ->need_resched flag instead of waiting for the
131  * cross-CPU IPI to arrive. Use this option with caution.
132  */
133 static void poll_idle(void)
134 {
135 	local_irq_enable();
136 	cpu_relax();
137 }
138 
139 void cpu_idle_wait(void)
140 {
141 	unsigned int cpu, this_cpu = get_cpu();
142 	cpumask_t map, tmp = current->cpus_allowed;
143 
144 	set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
145 	put_cpu();
146 
147 	cpus_clear(map);
148 	for_each_online_cpu(cpu) {
149 		per_cpu(cpu_idle_state, cpu) = 1;
150 		cpu_set(cpu, map);
151 	}
152 
153 	__get_cpu_var(cpu_idle_state) = 0;
154 
155 	wmb();
156 	do {
157 		ssleep(1);
158 		for_each_online_cpu(cpu) {
159 			if (cpu_isset(cpu, map) &&
160 					!per_cpu(cpu_idle_state, cpu))
161 				cpu_clear(cpu, map);
162 		}
163 		cpus_and(map, map, cpu_online_map);
164 	} while (!cpus_empty(map));
165 
166 	set_cpus_allowed(current, tmp);
167 }
168 EXPORT_SYMBOL_GPL(cpu_idle_wait);
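
/*
 * Editor's sketch (not part of this file): cpu_idle_wait() exists for code
 * that replaces pm_idle at run time (e.g. a C-state driver) and must be
 * sure no CPU is still running the previous idle routine before tearing it
 * down. The helper below is hypothetical and only shows the pattern.
 */
#if 0	/* illustrative only */
static void example_install_idle(void (*new_idle)(void))
{
	pm_idle = new_idle;
	/* Kick every CPU out of its current idle routine and wait until
	 * each online CPU has gone around the idle loop at least once. */
	cpu_idle_wait();
}
#endif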
169 
170 #ifdef CONFIG_HOTPLUG_CPU
171 DECLARE_PER_CPU(int, cpu_state);
172 
173 #include <asm/nmi.h>
174 /* We halt the CPU with physical CPU hotplug */
175 static inline void play_dead(void)
176 {
177 	idle_task_exit();
178 	wbinvd();
179 	mb();
180 	/* Ack it */
181 	__get_cpu_var(cpu_state) = CPU_DEAD;
182 
183 	local_irq_disable();
184 	while (1)
185 		halt();
186 }
187 #else
188 static inline void play_dead(void)
189 {
190 	BUG();
191 }
192 #endif /* CONFIG_HOTPLUG_CPU */
193 
194 /*
195  * The idle thread. There's no useful work to be
196  * done, so just try to conserve power and have a
197  * low exit latency (i.e. sit in a loop waiting for
198  * somebody to say that they'd like to reschedule)
199  */
200 void cpu_idle(void)
201 {
202 	current_thread_info()->status |= TS_POLLING;
203 	/* endless idle loop with no priority at all */
204 	while (1) {
205 		while (!need_resched()) {
206 			void (*idle)(void);
207 
208 			if (__get_cpu_var(cpu_idle_state))
209 				__get_cpu_var(cpu_idle_state) = 0;
210 
211 			rmb();
212 			idle = pm_idle;
213 			if (!idle)
214 				idle = default_idle;
215 			if (cpu_is_offline(smp_processor_id()))
216 				play_dead();
217 			/*
218 			 * Idle routines should keep interrupts disabled
219 			 * from here on, until they go to idle.
220 			 * Otherwise, idle callbacks can misfire.
221 			 */
222 			local_irq_disable();
223 			enter_idle();
224 			idle();
225 			/* In many cases the interrupt that ended idle
226 			   has already called exit_idle. But some idle
227 			   loops can be woken up without interrupt. */
228 			__exit_idle();
229 		}
230 
231 		preempt_enable_no_resched();
232 		schedule();
233 		preempt_disable();
234 	}
235 }
236 
237 /*
238  * This uses the new MONITOR/MWAIT instructions on P4 processors with PNI,
239  * which can obviate the IPI used to trigger checking of need_resched.
240  * We execute MONITOR against need_resched and enter optimized wait state
241  * through MWAIT. Whenever someone changes need_resched, we would be woken
242  * up from MWAIT (without an IPI).
243  *
244  * New with Core Duo processors, MWAIT can take some hints based on CPU
245  * capability.
246  */
247 void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
248 {
249 	if (!need_resched()) {
250 		__monitor((void *)&current_thread_info()->flags, 0, 0);
251 		smp_mb();
252 		if (!need_resched())
253 			__mwait(eax, ecx);
254 	}
255 }
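
/*
 * Editor's note (illustrative, not from this file): callers such as the
 * ACPI C-state code pass the target C-state as an MWAIT hint in EAX; by
 * convention bits 7:4 select the C-state (0 = C1, 1 = C2, ...) and bits
 * 3:0 a sub-state, while ECX carries extension flags. The helper below is
 * a hypothetical sketch of that calling pattern only.
 */
#if 0	/* illustrative only */
static void example_enter_c2(void)
{
	/* EAX = 0x10: C-state field 1 (C2), sub-state 0; ECX = 0 */
	mwait_idle_with_hints(0x10, 0);
}
#endif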
256 
257 /* Default MONITOR/MWAIT with no hints, used for default C1 state */
258 static void mwait_idle(void)
259 {
260 	if (!need_resched()) {
261 		__monitor((void *)&current_thread_info()->flags, 0, 0);
262 		smp_mb();
263 		if (!need_resched())
264 			__sti_mwait(0, 0);
265 		else
266 			local_irq_enable();
267 	} else {
268 		local_irq_enable();
269 	}
270 }
271 
272 void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
273 {
274 	static int printed;
275 	if (cpu_has(c, X86_FEATURE_MWAIT)) {
276 		/*
277 		 * Skip if setup has overridden idle.
278 		 * One CPU supports mwait => all CPUs support mwait.
279 		 */
280 		if (!pm_idle) {
281 			if (!printed) {
282 				printk(KERN_INFO "using mwait in idle threads.\n");
283 				printed = 1;
284 			}
285 			pm_idle = mwait_idle;
286 		}
287 	}
288 }
289 
290 static int __init idle_setup(char *str)
291 {
292 	if (!strcmp(str, "poll")) {
293 		printk(KERN_INFO "using polling idle threads.\n");
294 		pm_idle = poll_idle;
295 	} else if (!strcmp(str, "mwait"))
296 		force_mwait = 1;
297 	else
298 		return -1;
299 
300 	boot_option_idle_override = 1;
301 	return 0;
302 }
303 early_param("idle", idle_setup);
304 
305 /* Also prints some state that isn't saved in the pt_regs */
306 void __show_regs(struct pt_regs * regs)
307 {
308 	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
309 	unsigned long d0, d1, d2, d3, d6, d7;
310 	unsigned int fsindex, gsindex;
311 	unsigned int ds, cs, es;
312 
313 	printk("\n");
314 	print_modules();
315 	printk("Pid: %d, comm: %.20s %s %s %.*s\n",
316 		current->pid, current->comm, print_tainted(),
317 		init_utsname()->release,
318 		(int)strcspn(init_utsname()->version, " "),
319 		init_utsname()->version);
320 	printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip);
321 	printk_address(regs->rip);
322 	printk("RSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss, regs->rsp,
323 		regs->eflags);
324 	printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
325 	       regs->rax, regs->rbx, regs->rcx);
326 	printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
327 	       regs->rdx, regs->rsi, regs->rdi);
328 	printk("RBP: %016lx R08: %016lx R09: %016lx\n",
329 	       regs->rbp, regs->r8, regs->r9);
330 	printk("R10: %016lx R11: %016lx R12: %016lx\n",
331 	       regs->r10, regs->r11, regs->r12);
332 	printk("R13: %016lx R14: %016lx R15: %016lx\n",
333 	       regs->r13, regs->r14, regs->r15);
334 
335 	asm("movl %%ds,%0" : "=r" (ds));
336 	asm("movl %%cs,%0" : "=r" (cs));
337 	asm("movl %%es,%0" : "=r" (es));
338 	asm("movl %%fs,%0" : "=r" (fsindex));
339 	asm("movl %%gs,%0" : "=r" (gsindex));
340 
341 	rdmsrl(MSR_FS_BASE, fs);
342 	rdmsrl(MSR_GS_BASE, gs);
343 	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
344 
345 	cr0 = read_cr0();
346 	cr2 = read_cr2();
347 	cr3 = read_cr3();
348 	cr4 = read_cr4();
349 
350 	printk("FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
351 	       fs, fsindex, gs, gsindex, shadowgs);
352 	printk("CS:  %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0);
353 	printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4);
354 
355 	get_debugreg(d0, 0);
356 	get_debugreg(d1, 1);
357 	get_debugreg(d2, 2);
358 	printk("DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
359 	get_debugreg(d3, 3);
360 	get_debugreg(d6, 6);
361 	get_debugreg(d7, 7);
362 	printk("DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
363 }
364 
365 void show_regs(struct pt_regs *regs)
366 {
367 	printk("CPU %d:", smp_processor_id());
368 	__show_regs(regs);
369 	show_trace(NULL, regs, (void *)(regs + 1));
370 }
371 
372 /*
373  * Free current thread data structures, etc.
374  */
375 void exit_thread(void)
376 {
377 	struct task_struct *me = current;
378 	struct thread_struct *t = &me->thread;
379 
380 	if (me->thread.io_bitmap_ptr) {
381 		struct tss_struct *tss = &per_cpu(init_tss, get_cpu());
382 
383 		kfree(t->io_bitmap_ptr);
384 		t->io_bitmap_ptr = NULL;
385 		clear_thread_flag(TIF_IO_BITMAP);
386 		/*
387 		 * Careful, clear this in the TSS too:
388 		 */
389 		memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
390 		t->io_bitmap_max = 0;
391 		put_cpu();
392 	}
393 }
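
/*
 * Editor's sketch (not part of this file): the io_bitmap_ptr freed above is
 * allocated when a privileged task calls ioperm(2); from then on
 * TIF_IO_BITMAP is set and the bitmap is copied into the TSS on every
 * context switch (see __switch_to_xtra() below). A hypothetical userspace
 * trigger looks like this:
 */
#if 0	/* illustrative only, userspace */
#include <stdio.h>
#include <sys/io.h>

int main(void)
{
	/* needs CAP_SYS_RAWIO; grants ports 0x378-0x37a (parallel port) */
	if (ioperm(0x378, 3, 1)) {
		perror("ioperm");
		return 1;
	}
	outb(0xff, 0x378);	/* raw port I/O is now allowed from user space */
	return 0;
}
#endif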
394 
395 void flush_thread(void)
396 {
397 	struct task_struct *tsk = current;
398 
399 	if (test_tsk_thread_flag(tsk, TIF_ABI_PENDING)) {
400 		clear_tsk_thread_flag(tsk, TIF_ABI_PENDING);
401 		if (test_tsk_thread_flag(tsk, TIF_IA32)) {
402 			clear_tsk_thread_flag(tsk, TIF_IA32);
403 		} else {
404 			set_tsk_thread_flag(tsk, TIF_IA32);
405 			current_thread_info()->status |= TS_COMPAT;
406 		}
407 	}
408 	clear_tsk_thread_flag(tsk, TIF_DEBUG);
409 
410 	tsk->thread.debugreg0 = 0;
411 	tsk->thread.debugreg1 = 0;
412 	tsk->thread.debugreg2 = 0;
413 	tsk->thread.debugreg3 = 0;
414 	tsk->thread.debugreg6 = 0;
415 	tsk->thread.debugreg7 = 0;
416 	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
417 	/*
418 	 * Forget coprocessor state.
419 	 */
420 	clear_fpu(tsk);
421 	clear_used_math();
422 }
423 
424 void release_thread(struct task_struct *dead_task)
425 {
426 	if (dead_task->mm) {
427 		if (dead_task->mm->context.size) {
428 			printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
429 					dead_task->comm,
430 					dead_task->mm->context.ldt,
431 					dead_task->mm->context.size);
432 			BUG();
433 		}
434 	}
435 }
436 
437 static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
438 {
439 	struct user_desc ud = {
440 		.base_addr = addr,
441 		.limit = 0xfffff,
442 		.seg_32bit = 1,
443 		.limit_in_pages = 1,
444 		.useable = 1,
445 	};
446 	struct n_desc_struct *desc = (void *)t->thread.tls_array;
447 	desc += tls;
448 	desc->a = LDT_entry_a(&ud);
449 	desc->b = LDT_entry_b(&ud);
450 }
451 
452 static inline u32 read_32bit_tls(struct task_struct *t, int tls)
453 {
454 	struct desc_struct *desc = (void *)t->thread.tls_array;
455 	desc += tls;
456 	return desc->base0 |
457 		(((u32)desc->base1) << 16) |
458 		(((u32)desc->base2) << 24);
459 }
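
/*
 * Editor's note (illustrative): for a 32-bit base such as 0x12345678 the
 * descriptor fields read back above hold base0 = 0x5678 (bits 15:0),
 * base1 = 0x34 (bits 23:16) and base2 = 0x12 (bits 31:24), so
 * base0 | base1 << 16 | base2 << 24 reconstructs 0x12345678.
 */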
460 
461 /*
462  * This gets called before we allocate a new thread and copy
463  * the current task into it.
464  */
465 void prepare_to_copy(struct task_struct *tsk)
466 {
467 	unlazy_fpu(tsk);
468 }
469 
470 int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp,
471 		unsigned long unused,
472 	struct task_struct * p, struct pt_regs * regs)
473 {
474 	int err;
475 	struct pt_regs * childregs;
476 	struct task_struct *me = current;
477 
478 	childregs = ((struct pt_regs *)
479 			(THREAD_SIZE + task_stack_page(p))) - 1;
480 	*childregs = *regs;
481 
482 	childregs->rax = 0;
483 	childregs->rsp = rsp;
484 	if (rsp == ~0UL)
485 		childregs->rsp = (unsigned long)childregs;
486 
487 	p->thread.rsp = (unsigned long) childregs;
488 	p->thread.rsp0 = (unsigned long) (childregs+1);
489 	p->thread.userrsp = me->thread.userrsp;
490 
491 	set_tsk_thread_flag(p, TIF_FORK);
492 
493 	p->thread.fs = me->thread.fs;
494 	p->thread.gs = me->thread.gs;
495 
496 	asm("mov %%gs,%0" : "=m" (p->thread.gsindex));
497 	asm("mov %%fs,%0" : "=m" (p->thread.fsindex));
498 	asm("mov %%es,%0" : "=m" (p->thread.es));
499 	asm("mov %%ds,%0" : "=m" (p->thread.ds));
500 
501 	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
502 		p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
503 		if (!p->thread.io_bitmap_ptr) {
504 			p->thread.io_bitmap_max = 0;
505 			return -ENOMEM;
506 		}
507 		memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
508 				IO_BITMAP_BYTES);
509 		set_tsk_thread_flag(p, TIF_IO_BITMAP);
510 	}
511 
512 	/*
513 	 * Set a new TLS for the child thread?
514 	 */
515 	if (clone_flags & CLONE_SETTLS) {
516 #ifdef CONFIG_IA32_EMULATION
517 		if (test_thread_flag(TIF_IA32))
518 			err = ia32_child_tls(p, childregs);
519 		else
520 #endif
521 			err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
522 		if (err)
523 			goto out;
524 	}
525 	err = 0;
526 out:
527 	if (err && p->thread.io_bitmap_ptr) {
528 		kfree(p->thread.io_bitmap_ptr);
529 		p->thread.io_bitmap_max = 0;
530 	}
531 	return err;
532 }
533 
534 /*
535  * This special macro can be used to load a debug register.
536  */
537 #define loaddebug(thread,r) set_debugreg(thread->debugreg ## r, r)
538 
539 static inline void __switch_to_xtra(struct task_struct *prev_p,
540 			     	    struct task_struct *next_p,
541 			     	    struct tss_struct *tss)
542 {
543 	struct thread_struct *prev, *next;
544 
545 	prev = &prev_p->thread;
546 	next = &next_p->thread;
547 
548 	if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
549 		loaddebug(next, 0);
550 		loaddebug(next, 1);
551 		loaddebug(next, 2);
552 		loaddebug(next, 3);
553 		/* no 4 and 5 */
554 		loaddebug(next, 6);
555 		loaddebug(next, 7);
556 	}
557 
558 	if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
559 		/*
560 		 * Copy the relevant range of the IO bitmap.
561 		 * Normally this is 128 bytes or less:
562 		 */
563 		memcpy(tss->io_bitmap, next->io_bitmap_ptr,
564 		       max(prev->io_bitmap_max, next->io_bitmap_max));
565 	} else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
566 		/*
567 		 * Clear any possible leftover bits:
568 		 */
569 		memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
570 	}
571 }
572 
573 /*
574  *	switch_to(x,y) should switch tasks from x to y.
575  *
576  * This could still be optimized:
577  * - fold all the options into a flag word and test it with a single test.
578  * - could test fs/gs bitsliced
579  *
580  * Kprobes not supported here. Set the probe on schedule instead.
581  */
582 __kprobes struct task_struct *
583 __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
584 {
585 	struct thread_struct *prev = &prev_p->thread,
586 				 *next = &next_p->thread;
587 	int cpu = smp_processor_id();
588 	struct tss_struct *tss = &per_cpu(init_tss, cpu);
589 
590 	/* we're going to use this soon, after a few expensive things */
591 	if (next_p->fpu_counter > 5)
592 		prefetch(&next->i387.fxsave);
593 
594 	/*
595 	 * Reload esp0, LDT and the page table pointer:
596 	 */
597 	tss->rsp0 = next->rsp0;
598 
599 	/*
600 	 * Switch DS and ES.
601 	 * This won't pick up thread selector changes, but I guess that is ok.
602 	 */
603 	asm volatile("mov %%es,%0" : "=m" (prev->es));
604 	if (unlikely(next->es | prev->es))
605 		loadsegment(es, next->es);
606 
607 	asm volatile ("mov %%ds,%0" : "=m" (prev->ds));
608 	if (unlikely(next->ds | prev->ds))
609 		loadsegment(ds, next->ds);
610 
611 	load_TLS(next, cpu);
612 
613 	/*
614 	 * Switch FS and GS.
615 	 */
616 	{
617 		unsigned fsindex;
618 		asm volatile("movl %%fs,%0" : "=r" (fsindex));
619 		/* A non-zero segment register always requires a reload.
620 		   Also reload when it has changed.
621 		   When the previous process used a 64bit base, always
622 		   reload to avoid an information leak. */
623 		if (unlikely(fsindex | next->fsindex | prev->fs)) {
624 			loadsegment(fs, next->fsindex);
625 			/* Check if the user used a selector != 0; if yes,
626 			 * clear the 64bit base, since the overloaded base
627 			 * is always mapped to the null selector.
628 			 */
629 			if (fsindex)
630 				prev->fs = 0;
631 		}
632 		/* when next process has a 64bit base use it */
633 		if (next->fs)
634 			wrmsrl(MSR_FS_BASE, next->fs);
635 		prev->fsindex = fsindex;
636 	}
637 	{
638 		unsigned gsindex;
639 		asm volatile("movl %%gs,%0" : "=r" (gsindex));
640 		if (unlikely(gsindex | next->gsindex | prev->gs)) {
641 			load_gs_index(next->gsindex);
642 			if (gsindex)
643 				prev->gs = 0;
644 		}
645 		if (next->gs)
646 			wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
647 		prev->gsindex = gsindex;
648 	}
649 
650 	/* Must be after DS reload */
651 	unlazy_fpu(prev_p);
652 
653 	/*
654 	 * Switch the PDA and FPU contexts.
655 	 */
656 	prev->userrsp = read_pda(oldrsp);
657 	write_pda(oldrsp, next->userrsp);
658 	write_pda(pcurrent, next_p);
659 
660 	write_pda(kernelstack,
661 		  (unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
662 #ifdef CONFIG_CC_STACKPROTECTOR
663 	write_pda(stack_canary, next_p->stack_canary);
664 	/*
665 	 * Build time only check to make sure the stack_canary is at
666 	 * offset 40 in the pda; this is a gcc ABI requirement
667 	 */
668 	BUILD_BUG_ON(offsetof(struct x8664_pda, stack_canary) != 40);
669 #endif
670 
671 	/*
672 	 * Now maybe reload the debug registers and handle I/O bitmaps
673 	 */
674 	if (unlikely((task_thread_info(next_p)->flags & _TIF_WORK_CTXSW))
675 	    || test_tsk_thread_flag(prev_p, TIF_IO_BITMAP))
676 		__switch_to_xtra(prev_p, next_p, tss);
677 
678 	/* If the task has used the FPU during the last 5 timeslices, just do
679 	 * a full restore of the math state immediately to avoid the trap; the
680 	 * chances of needing the FPU soon are obviously high now.
681 	 */
682 	if (next_p->fpu_counter > 5)
683 		math_state_restore();
684 	return prev_p;
685 }
686 
687 /*
688  * sys_execve() executes a new program.
689  */
690 asmlinkage
691 long sys_execve(char __user *name, char __user * __user *argv,
692 		char __user * __user *envp, struct pt_regs regs)
693 {
694 	long error;
695 	char * filename;
696 
697 	filename = getname(name);
698 	error = PTR_ERR(filename);
699 	if (IS_ERR(filename))
700 		return error;
701 	error = do_execve(filename, argv, envp, &regs);
702 	if (error == 0) {
703 		task_lock(current);
704 		current->ptrace &= ~PT_DTRACE;
705 		task_unlock(current);
706 	}
707 	putname(filename);
708 	return error;
709 }
710 
711 void set_personality_64bit(void)
712 {
713 	/* inherit personality from parent */
714 
715 	/* Make sure to be in 64bit mode */
716 	clear_thread_flag(TIF_IA32);
717 
718 	/* TBD: overwrites user setup. Should have two bits.
719 	   But 64bit processes have always behaved this way,
720 	   so it's not too bad. The main problem is just that
721 	   32bit children are affected again. */
722 	current->personality &= ~READ_IMPLIES_EXEC;
723 }
724 
725 asmlinkage long sys_fork(struct pt_regs *regs)
726 {
727 	return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL);
728 }
729 
730 asmlinkage long
731 sys_clone(unsigned long clone_flags, unsigned long newsp,
732 	  void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
733 {
734 	if (!newsp)
735 		newsp = regs->rsp;
736 	return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
737 }
738 
739 /*
740  * This is trivial, and on the face of it looks like it
741  * could equally well be done in user mode.
742  *
743  * Not so, for quite unobvious reasons - register pressure.
744  * In user mode vfork() cannot have a stack frame, and if
745  * done by calling the "clone()" system call directly, you
746  * do not have enough call-clobbered registers to hold all
747  * the information you need.
748  */
749 asmlinkage long sys_vfork(struct pt_regs *regs)
750 {
751 	return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->rsp, regs, 0,
752 		    NULL, NULL);
753 }
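
/*
 * Editor's sketch (not part of this file): sys_vfork() above amounts to
 * clone(CLONE_VFORK | CLONE_VM | SIGCHLD) with the child running on the
 * parent's stack, which is exactly why the comment says it cannot be
 * rebuilt in user mode: the child must not disturb the shared stack frame
 * and may only exec or _exit. Hypothetical userspace illustration:
 */
#if 0	/* illustrative only, userspace */
#include <unistd.h>

int main(void)
{
	pid_t pid = vfork();

	if (pid == 0) {
		/* child: restricted to exec/_exit while sharing the stack */
		execl("/bin/true", "true", (char *)NULL);
		_exit(127);
	}
	return pid < 0;
}
#endif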
754 
755 unsigned long get_wchan(struct task_struct *p)
756 {
757 	unsigned long stack;
758 	u64 fp,rip;
759 	int count = 0;
760 
761 	if (!p || p == current || p->state==TASK_RUNNING)
762 		return 0;
763 	stack = (unsigned long)task_stack_page(p);
764 	if (p->thread.rsp < stack || p->thread.rsp > stack+THREAD_SIZE)
765 		return 0;
766 	fp = *(u64 *)(p->thread.rsp);
767 	do {
768 		if (fp < (unsigned long)stack ||
769 		    fp > (unsigned long)stack+THREAD_SIZE)
770 			return 0;
771 		rip = *(u64 *)(fp+8);
772 		if (!in_sched_functions(rip))
773 			return rip;
774 		fp = *(u64 *)fp;
775 	} while (count++ < 16);
776 	return 0;
777 }
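
/*
 * Editor's note (illustrative): get_wchan() is what backs the "wchan"
 * entry in /proc -- for a task blocked in the kernel it reports the first
 * non-scheduler function on its stack. A hypothetical userspace check:
 */
#if 0	/* illustrative only, userspace */
#include <stdio.h>

int main(void)
{
	char buf[128] = "";
	FILE *f = fopen("/proc/1/wchan", "r");

	if (!f)
		return 1;
	if (fgets(buf, sizeof(buf), f))
		printf("pid 1 is waiting in: %s\n", buf);
	fclose(f);
	return 0;
}
#endif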
778 
779 long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
780 {
781 	int ret = 0;
782 	int doit = task == current;
783 	int cpu;
784 
785 	switch (code) {
786 	case ARCH_SET_GS:
787 		if (addr >= TASK_SIZE_OF(task))
788 			return -EPERM;
789 		cpu = get_cpu();
790 		/* handle small bases via the GDT because that's faster to
791 		   switch. */
792 		if (addr <= 0xffffffff) {
793 			set_32bit_tls(task, GS_TLS, addr);
794 			if (doit) {
795 				load_TLS(&task->thread, cpu);
796 				load_gs_index(GS_TLS_SEL);
797 			}
798 			task->thread.gsindex = GS_TLS_SEL;
799 			task->thread.gs = 0;
800 		} else {
801 			task->thread.gsindex = 0;
802 			task->thread.gs = addr;
803 			if (doit) {
804 				load_gs_index(0);
805 				ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
806 			}
807 		}
808 		put_cpu();
809 		break;
810 	case ARCH_SET_FS:
811 		/* Not strictly needed for fs, but do it for symmetry
812 		   with gs */
813 		if (addr >= TASK_SIZE_OF(task))
814 			return -EPERM;
815 		cpu = get_cpu();
816 		/* handle small bases via the GDT because that's faster to
817 		   switch. */
818 		if (addr <= 0xffffffff) {
819 			set_32bit_tls(task, FS_TLS, addr);
820 			if (doit) {
821 				load_TLS(&task->thread, cpu);
822 				asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL));
823 			}
824 			task->thread.fsindex = FS_TLS_SEL;
825 			task->thread.fs = 0;
826 		} else {
827 			task->thread.fsindex = 0;
828 			task->thread.fs = addr;
829 			if (doit) {
830 				/* set the selector to 0 to not confuse
831 				   __switch_to */
832 				asm volatile("movl %0,%%fs" :: "r" (0));
833 				ret = checking_wrmsrl(MSR_FS_BASE, addr);
834 			}
835 		}
836 		put_cpu();
837 		break;
838 	case ARCH_GET_FS: {
839 		unsigned long base;
840 		if (task->thread.fsindex == FS_TLS_SEL)
841 			base = read_32bit_tls(task, FS_TLS);
842 		else if (doit)
843 			rdmsrl(MSR_FS_BASE, base);
844 		else
845 			base = task->thread.fs;
846 		ret = put_user(base, (unsigned long __user *)addr);
847 		break;
848 	}
849 	case ARCH_GET_GS: {
850 		unsigned long base;
851 		unsigned gsindex;
852 		if (task->thread.gsindex == GS_TLS_SEL)
853 			base = read_32bit_tls(task, GS_TLS);
854 		else if (doit) {
855  			asm("movl %%gs,%0" : "=r" (gsindex));
856 			if (gsindex)
857 				rdmsrl(MSR_KERNEL_GS_BASE, base);
858 			else
859 				base = task->thread.gs;
860 		}
861 		else
862 			base = task->thread.gs;
863 		ret = put_user(base, (unsigned long __user *)addr);
864 		break;
865 	}
866 
867 	default:
868 		ret = -EINVAL;
869 		break;
870 	}
871 
872 	return ret;
873 }
874 
875 long sys_arch_prctl(int code, unsigned long addr)
876 {
877 	return do_arch_prctl(current, code, addr);
878 }
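
/*
 * Editor's sketch (not part of this file): arch_prctl(2) is how 64-bit
 * userspace reaches the ARCH_SET_FS/ARCH_GET_FS paths above, e.g. a
 * threading library installing its TLS base. Hypothetical illustration
 * using the raw syscall:
 */
#if 0	/* illustrative only, userspace */
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <asm/prctl.h>		/* ARCH_GET_FS and friends */

int main(void)
{
	unsigned long fsbase = 0;

	if (syscall(SYS_arch_prctl, ARCH_GET_FS, &fsbase))
		return 1;
	printf("FS base: %#lx\n", fsbase);
	return 0;
}
#endif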
879 
880 /*
881  * Capture the user space registers if the task is not running (in user space)
882  */
883 int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
884 {
885 	struct pt_regs *pp, ptregs;
886 
887 	pp = task_pt_regs(tsk);
888 
889 	ptregs = *pp;
890 	ptregs.cs &= 0xffff;
891 	ptregs.ss &= 0xffff;
892 
893 	elf_core_copy_regs(regs, &ptregs);
894 
895 	return 1;
896 }
897 
898 unsigned long arch_align_stack(unsigned long sp)
899 {
900 	if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
901 		sp -= get_random_int() % 8192;
902 	return sp & ~0xf;
903 }
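
/*
 * Editor's note (illustrative): arch_align_stack() above lowers the initial
 * user stack pointer by a random amount of up to 8 KB and then rounds it
 * down to a 16-byte boundary, as the x86-64 ABI expects. A hypothetical
 * userspace model of the same arithmetic:
 */
#if 0	/* illustrative only, userspace */
#include <stdio.h>
#include <stdlib.h>

static unsigned long model_align_stack(unsigned long sp)
{
	sp -= (unsigned long)rand() % 8192;	/* stand-in for get_random_int() */
	return sp & ~0xfUL;			/* 16-byte alignment */
}

int main(void)
{
	printf("%#lx\n", model_align_stack(0x7fffffffe000UL));
	return 0;
}
#endif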
904