xref: /linux/arch/x86/kernel/process_64.c (revision 2b8232ce512105e28453f301d1510de8363bccd1)
1 /*
2  *  linux/arch/x86-64/kernel/process.c
3  *
4  *  Copyright (C) 1995  Linus Torvalds
5  *
6  *  Pentium III FXSR, SSE support
7  *	Gareth Hughes <gareth@valinux.com>, May 2000
8  *
9  *  X86-64 port
10  *	Andi Kleen.
11  *
12  *	CPU hotplug support - ashok.raj@intel.com
13  */
14 
15 /*
16  * This file handles the architecture-dependent parts of process handling..
17  */
18 
19 #include <stdarg.h>
20 
21 #include <linux/cpu.h>
22 #include <linux/errno.h>
23 #include <linux/sched.h>
24 #include <linux/kernel.h>
25 #include <linux/mm.h>
26 #include <linux/fs.h>
27 #include <linux/elfcore.h>
28 #include <linux/smp.h>
29 #include <linux/slab.h>
30 #include <linux/user.h>
31 #include <linux/module.h>
32 #include <linux/a.out.h>
33 #include <linux/interrupt.h>
34 #include <linux/delay.h>
35 #include <linux/ptrace.h>
36 #include <linux/utsname.h>
37 #include <linux/random.h>
38 #include <linux/notifier.h>
39 #include <linux/kprobes.h>
40 #include <linux/kdebug.h>
41 #include <linux/tick.h>
42 
43 #include <asm/uaccess.h>
44 #include <asm/pgtable.h>
45 #include <asm/system.h>
46 #include <asm/io.h>
47 #include <asm/processor.h>
48 #include <asm/i387.h>
49 #include <asm/mmu_context.h>
50 #include <asm/pda.h>
51 #include <asm/prctl.h>
52 #include <asm/desc.h>
53 #include <asm/proto.h>
54 #include <asm/ia32.h>
55 #include <asm/idle.h>
56 
57 asmlinkage extern void ret_from_fork(void);
58 
59 unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;
60 
61 unsigned long boot_option_idle_override = 0;
62 EXPORT_SYMBOL(boot_option_idle_override);
63 
64 /*
65  * Power management idle function, if any..
66  */
67 void (*pm_idle)(void);
68 EXPORT_SYMBOL(pm_idle);
69 static DEFINE_PER_CPU(unsigned int, cpu_idle_state);
70 
71 static ATOMIC_NOTIFIER_HEAD(idle_notifier);
72 
73 void idle_notifier_register(struct notifier_block *n)
74 {
75 	atomic_notifier_chain_register(&idle_notifier, n);
76 }
77 EXPORT_SYMBOL_GPL(idle_notifier_register);
78 
79 void idle_notifier_unregister(struct notifier_block *n)
80 {
81 	atomic_notifier_chain_unregister(&idle_notifier, n);
82 }
83 EXPORT_SYMBOL(idle_notifier_unregister);
84 
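/* Mark this CPU as idle in the PDA and run the idle-start notifier chain. */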
85 void enter_idle(void)
86 {
87 	write_pda(isidle, 1);
88 	atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
89 }
90 
91 static void __exit_idle(void)
92 {
93 	if (test_and_clear_bit_pda(0, isidle) == 0)
94 		return;
95 	atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
96 }
97 
98 /* Called from interrupts to signify idle end */
99 void exit_idle(void)
100 {
101 	/* idle loop has pid 0 */
102 	if (current->pid)
103 		return;
104 	__exit_idle();
105 }
106 
107 /*
108  * We use this if we don't have any better
109  * idle routine..
110  */
111 static void default_idle(void)
112 {
113 	current_thread_info()->status &= ~TS_POLLING;
114 	/*
115 	 * TS_POLLING-cleared state must be visible before we
116 	 * test NEED_RESCHED:
117 	 */
118 	smp_mb();
119 	local_irq_disable();
120 	if (!need_resched()) {
121 		/* Enables interrupts one instruction before HLT.
122 		   x86 special cases this so there is no race. */
123 		safe_halt();
124 	} else
125 		local_irq_enable();
126 	current_thread_info()->status |= TS_POLLING;
127 }
128 
129 /*
130  * On SMP it's slightly faster (but much more power-consuming!)
131  * to poll the ->need_resched flag instead of waiting for the
132  * cross-CPU IPI to arrive. Use this option with caution.
133  */
134 static void poll_idle(void)
135 {
136 	local_irq_enable();
137 	cpu_relax();
138 }
139 
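/*
 * Wait until every online CPU has left its current idle handler.
 * Typically used after pm_idle is changed, so that no CPU is still
 * executing the old routine.
 */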
140 void cpu_idle_wait(void)
141 {
142 	unsigned int cpu, this_cpu = get_cpu();
143 	cpumask_t map, tmp = current->cpus_allowed;
144 
145 	set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
146 	put_cpu();
147 
148 	cpus_clear(map);
149 	for_each_online_cpu(cpu) {
150 		per_cpu(cpu_idle_state, cpu) = 1;
151 		cpu_set(cpu, map);
152 	}
153 
154 	__get_cpu_var(cpu_idle_state) = 0;
155 
156 	wmb();
157 	do {
158 		ssleep(1);
159 		for_each_online_cpu(cpu) {
160 			if (cpu_isset(cpu, map) &&
161 					!per_cpu(cpu_idle_state, cpu))
162 				cpu_clear(cpu, map);
163 		}
164 		cpus_and(map, map, cpu_online_map);
165 	} while (!cpus_empty(map));
166 
167 	set_cpus_allowed(current, tmp);
168 }
169 EXPORT_SYMBOL_GPL(cpu_idle_wait);
170 
171 #ifdef CONFIG_HOTPLUG_CPU
172 DECLARE_PER_CPU(int, cpu_state);
173 
174 #include <asm/nmi.h>
175 /* With physical CPU hotplug, we simply halt the dead CPU */
176 static inline void play_dead(void)
177 {
178 	idle_task_exit();
179 	wbinvd();
180 	mb();
181 	/* Ack it */
182 	__get_cpu_var(cpu_state) = CPU_DEAD;
183 
184 	local_irq_disable();
185 	while (1)
186 		halt();
187 }
188 #else
189 static inline void play_dead(void)
190 {
191 	BUG();
192 }
193 #endif /* CONFIG_HOTPLUG_CPU */
194 
195 /*
196  * The idle thread. There's no useful work to be
197  * done, so just try to conserve power and have a
198  * low exit latency (ie sit in a loop waiting for
199  * somebody to say that they'd like to reschedule)
200  */
201 void cpu_idle(void)
202 {
203 	current_thread_info()->status |= TS_POLLING;
204 	/* endless idle loop with no priority at all */
205 	while (1) {
206 		while (!need_resched()) {
207 			void (*idle)(void);
208 
209 			if (__get_cpu_var(cpu_idle_state))
210 				__get_cpu_var(cpu_idle_state) = 0;
211 
212 			tick_nohz_stop_sched_tick();
213 
214 			rmb();
215 			idle = pm_idle;
216 			if (!idle)
217 				idle = default_idle;
218 			if (cpu_is_offline(smp_processor_id()))
219 				play_dead();
220 			/*
221 			 * Idle routines should keep interrupts disabled
222 			 * from here on, until they go to idle.
223 			 * Otherwise, idle callbacks can misfire.
224 			 */
225 			local_irq_disable();
226 			enter_idle();
227 			idle();
228 			/* In many cases the interrupt that ended idle
229 			   has already called exit_idle. But some idle
230 			   loops can be woken up without an interrupt. */
231 			__exit_idle();
232 		}
233 
234 		tick_nohz_restart_sched_tick();
235 		preempt_enable_no_resched();
236 		schedule();
237 		preempt_disable();
238 	}
239 }
240 
241 /*
242  * This uses the MONITOR/MWAIT instructions on P4 processors with PNI,
243  * which can obviate the IPI normally needed to trigger a need_resched check.
244  * We execute MONITOR against need_resched and enter an optimized wait state
245  * through MWAIT. Whenever someone changes need_resched, we are woken
246  * up from MWAIT (without an IPI).
247  *
248  * Starting with Core Duo processors, MWAIT can take hints based on CPU
249  * capability.
250  */
251 void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
252 {
253 	if (!need_resched()) {
254 		__monitor((void *)&current_thread_info()->flags, 0, 0);
255 		smp_mb();
256 		if (!need_resched())
257 			__mwait(eax, ecx);
258 	}
259 }
260 
261 /* Default MONITOR/MWAIT with no hints, used for default C1 state */
262 static void mwait_idle(void)
263 {
264 	if (!need_resched()) {
265 		__monitor((void *)&current_thread_info()->flags, 0, 0);
266 		smp_mb();
267 		if (!need_resched())
268 			__sti_mwait(0, 0);
269 		else
270 			local_irq_enable();
271 	} else {
272 		local_irq_enable();
273 	}
274 }
275 
276 void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
277 {
278 	static int printed;
279 	if (cpu_has(c, X86_FEATURE_MWAIT)) {
280 		/*
281 		 * Skip if setup has overridden the idle routine.
282 		 * One CPU supports mwait => all CPUs support mwait
283 		 */
284 		if (!pm_idle) {
285 			if (!printed) {
286 				printk(KERN_INFO "using mwait in idle threads.\n");
287 				printed = 1;
288 			}
289 			pm_idle = mwait_idle;
290 		}
291 	}
292 }
293 
294 static int __init idle_setup(char *str)
295 {
296 	if (!strcmp(str, "poll")) {
297 		printk(KERN_INFO "using polling idle threads.\n");
298 		pm_idle = poll_idle;
299 	} else if (!strcmp(str, "mwait"))
300 		force_mwait = 1;
301 	else
302 		return -1;
303 
304 	boot_option_idle_override = 1;
305 	return 0;
306 }
307 early_param("idle", idle_setup);
308 
309 /* Also prints some state that isn't saved in pt_regs */
310 void __show_regs(struct pt_regs * regs)
311 {
312 	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
313 	unsigned long d0, d1, d2, d3, d6, d7;
314 	unsigned int fsindex, gsindex;
315 	unsigned int ds, cs, es;
316 
317 	printk("\n");
318 	print_modules();
319 	printk("Pid: %d, comm: %.20s %s %s %.*s\n",
320 		current->pid, current->comm, print_tainted(),
321 		init_utsname()->release,
322 		(int)strcspn(init_utsname()->version, " "),
323 		init_utsname()->version);
324 	printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip);
325 	printk_address(regs->rip);
326 	printk("RSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss, regs->rsp,
327 		regs->eflags);
328 	printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
329 	       regs->rax, regs->rbx, regs->rcx);
330 	printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
331 	       regs->rdx, regs->rsi, regs->rdi);
332 	printk("RBP: %016lx R08: %016lx R09: %016lx\n",
333 	       regs->rbp, regs->r8, regs->r9);
334 	printk("R10: %016lx R11: %016lx R12: %016lx\n",
335 	       regs->r10, regs->r11, regs->r12);
336 	printk("R13: %016lx R14: %016lx R15: %016lx\n",
337 	       regs->r13, regs->r14, regs->r15);
338 
339 	asm("movl %%ds,%0" : "=r" (ds));
340 	asm("movl %%cs,%0" : "=r" (cs));
341 	asm("movl %%es,%0" : "=r" (es));
342 	asm("movl %%fs,%0" : "=r" (fsindex));
343 	asm("movl %%gs,%0" : "=r" (gsindex));
344 
345 	rdmsrl(MSR_FS_BASE, fs);
346 	rdmsrl(MSR_GS_BASE, gs);
347 	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
348 
349 	cr0 = read_cr0();
350 	cr2 = read_cr2();
351 	cr3 = read_cr3();
352 	cr4 = read_cr4();
353 
354 	printk("FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
355 	       fs, fsindex, gs, gsindex, shadowgs);
356 	printk("CS:  %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0);
357 	printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4);
358 
359 	get_debugreg(d0, 0);
360 	get_debugreg(d1, 1);
361 	get_debugreg(d2, 2);
362 	printk("DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
363 	get_debugreg(d3, 3);
364 	get_debugreg(d6, 6);
365 	get_debugreg(d7, 7);
366 	printk("DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
367 }
368 
369 void show_regs(struct pt_regs *regs)
370 {
371 	printk("CPU %d:", smp_processor_id());
372 	__show_regs(regs);
373 	show_trace(NULL, regs, (void *)(regs + 1));
374 }
375 
376 /*
377  * Free current thread data structures etc..
378  */
379 void exit_thread(void)
380 {
381 	struct task_struct *me = current;
382 	struct thread_struct *t = &me->thread;
383 
384 	if (me->thread.io_bitmap_ptr) {
385 		struct tss_struct *tss = &per_cpu(init_tss, get_cpu());
386 
387 		kfree(t->io_bitmap_ptr);
388 		t->io_bitmap_ptr = NULL;
389 		clear_thread_flag(TIF_IO_BITMAP);
390 		/*
391 		 * Careful, clear this in the TSS too:
392 		 */
393 		memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
394 		t->io_bitmap_max = 0;
395 		put_cpu();
396 	}
397 }
398 
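/*
 * Reset per-thread state (ABI flags, debug registers, TLS and FPU)
 * for a freshly exec'ed task.
 */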
399 void flush_thread(void)
400 {
401 	struct task_struct *tsk = current;
402 
403 	if (test_tsk_thread_flag(tsk, TIF_ABI_PENDING)) {
404 		clear_tsk_thread_flag(tsk, TIF_ABI_PENDING);
405 		if (test_tsk_thread_flag(tsk, TIF_IA32)) {
406 			clear_tsk_thread_flag(tsk, TIF_IA32);
407 		} else {
408 			set_tsk_thread_flag(tsk, TIF_IA32);
409 			current_thread_info()->status |= TS_COMPAT;
410 		}
411 	}
412 	clear_tsk_thread_flag(tsk, TIF_DEBUG);
413 
414 	tsk->thread.debugreg0 = 0;
415 	tsk->thread.debugreg1 = 0;
416 	tsk->thread.debugreg2 = 0;
417 	tsk->thread.debugreg3 = 0;
418 	tsk->thread.debugreg6 = 0;
419 	tsk->thread.debugreg7 = 0;
420 	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
421 	/*
422 	 * Forget coprocessor state..
423 	 */
424 	clear_fpu(tsk);
425 	clear_used_math();
426 }
427 
428 void release_thread(struct task_struct *dead_task)
429 {
430 	if (dead_task->mm) {
431 		if (dead_task->mm->context.size) {
432 			printk(KERN_WARNING "WARNING: dead process %8s still has LDT? <%p/%d>\n",
433 					dead_task->comm,
434 					dead_task->mm->context.ldt,
435 					dead_task->mm->context.size);
436 			BUG();
437 		}
438 	}
439 }
440 
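/*
 * Install a 32bit TLS descriptor with the given base address and a 4GB
 * limit into the given slot of the task's TLS array.
 */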
441 static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
442 {
443 	struct user_desc ud = {
444 		.base_addr = addr,
445 		.limit = 0xfffff,
446 		.seg_32bit = 1,
447 		.limit_in_pages = 1,
448 		.useable = 1,
449 	};
450 	struct n_desc_struct *desc = (void *)t->thread.tls_array;
451 	desc += tls;
452 	desc->a = LDT_entry_a(&ud);
453 	desc->b = LDT_entry_b(&ud);
454 }
455 
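/* Return the base address stored in the given TLS descriptor. */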
456 static inline u32 read_32bit_tls(struct task_struct *t, int tls)
457 {
458 	struct desc_struct *desc = (void *)t->thread.tls_array;
459 	desc += tls;
460 	return desc->base0 |
461 		(((u32)desc->base1) << 16) |
462 		(((u32)desc->base2) << 24);
463 }
464 
465 /*
466  * This gets called before we allocate a new thread and copy
467  * the current task into it.
468  */
469 void prepare_to_copy(struct task_struct *tsk)
470 {
471 	unlazy_fpu(tsk);
472 }
473 
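/*
 * Set up the child's kernel stack, register state, segment selectors,
 * I/O bitmap and TLS at fork/clone time.
 */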
474 int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp,
475 		unsigned long unused,
476 	struct task_struct * p, struct pt_regs * regs)
477 {
478 	int err;
479 	struct pt_regs * childregs;
480 	struct task_struct *me = current;
481 
482 	childregs = ((struct pt_regs *)
483 			(THREAD_SIZE + task_stack_page(p))) - 1;
484 	*childregs = *regs;
485 
486 	childregs->rax = 0;
487 	childregs->rsp = rsp;
488 	if (rsp == ~0UL)
489 		childregs->rsp = (unsigned long)childregs;
490 
491 	p->thread.rsp = (unsigned long) childregs;
492 	p->thread.rsp0 = (unsigned long) (childregs+1);
493 	p->thread.userrsp = me->thread.userrsp;
494 
495 	set_tsk_thread_flag(p, TIF_FORK);
496 
497 	p->thread.fs = me->thread.fs;
498 	p->thread.gs = me->thread.gs;
499 
500 	asm("mov %%gs,%0" : "=m" (p->thread.gsindex));
501 	asm("mov %%fs,%0" : "=m" (p->thread.fsindex));
502 	asm("mov %%es,%0" : "=m" (p->thread.es));
503 	asm("mov %%ds,%0" : "=m" (p->thread.ds));
504 
505 	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
506 		p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
507 		if (!p->thread.io_bitmap_ptr) {
508 			p->thread.io_bitmap_max = 0;
509 			return -ENOMEM;
510 		}
511 		memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
512 				IO_BITMAP_BYTES);
513 		set_tsk_thread_flag(p, TIF_IO_BITMAP);
514 	}
515 
516 	/*
517 	 * Set a new TLS for the child thread?
518 	 */
519 	if (clone_flags & CLONE_SETTLS) {
520 #ifdef CONFIG_IA32_EMULATION
521 		if (test_thread_flag(TIF_IA32))
522 			err = ia32_child_tls(p, childregs);
523 		else
524 #endif
525 			err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
526 		if (err)
527 			goto out;
528 	}
529 	err = 0;
530 out:
531 	if (err && p->thread.io_bitmap_ptr) {
532 		kfree(p->thread.io_bitmap_ptr);
533 		p->thread.io_bitmap_max = 0;
534 	}
535 	return err;
536 }
537 
538 /*
539  * This special macro can be used to load a debugging register
540  */
541 #define loaddebug(thread,r) set_debugreg(thread->debugreg ## r, r)
542 
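/*
 * Slow-path context switch work: reload the debug registers and update
 * the TSS I/O permission bitmap when either task needs it.
 */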
543 static inline void __switch_to_xtra(struct task_struct *prev_p,
544 			     	    struct task_struct *next_p,
545 			     	    struct tss_struct *tss)
546 {
547 	struct thread_struct *prev, *next;
548 
549 	prev = &prev_p->thread;
550 	next = &next_p->thread;
551 
552 	if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
553 		loaddebug(next, 0);
554 		loaddebug(next, 1);
555 		loaddebug(next, 2);
556 		loaddebug(next, 3);
557 		/* debug registers 4 and 5 are not used */
558 		loaddebug(next, 6);
559 		loaddebug(next, 7);
560 	}
561 
562 	if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
563 		/*
564 		 * Copy the relevant range of the IO bitmap.
565 		 * Normally this is 128 bytes or less:
566 		 */
567 		memcpy(tss->io_bitmap, next->io_bitmap_ptr,
568 		       max(prev->io_bitmap_max, next->io_bitmap_max));
569 	} else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
570 		/*
571 		 * Clear any possible leftover bits:
572 		 */
573 		memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
574 	}
575 }
576 
577 /*
578  *	switch_to(x,y) should switch tasks from x to y.
579  *
580  * This could still be optimized:
581  * - fold all the options into a flag word and test it with a single test.
582  * - could test fs/gs bitsliced
583  *
584  * Kprobes not supported here. Set the probe on schedule instead.
585  */
586 __kprobes struct task_struct *
587 __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
588 {
589 	struct thread_struct *prev = &prev_p->thread,
590 				 *next = &next_p->thread;
591 	int cpu = smp_processor_id();
592 	struct tss_struct *tss = &per_cpu(init_tss, cpu);
593 
594 	/* We're going to use this soon, after a few expensive things */
595 	if (next_p->fpu_counter > 5)
596 		prefetch(&next->i387.fxsave);
597 
598 	/*
599 	 * Reload the kernel stack pointer (rsp0) in the TSS:
600 	 */
601 	tss->rsp0 = next->rsp0;
602 
603 	/*
604 	 * Switch DS and ES.
605 	 * This won't pick up thread selector changes, but I guess that is ok.
606 	 */
607 	asm volatile("mov %%es,%0" : "=m" (prev->es));
608 	if (unlikely(next->es | prev->es))
609 		loadsegment(es, next->es);
610 
611 	asm volatile ("mov %%ds,%0" : "=m" (prev->ds));
612 	if (unlikely(next->ds | prev->ds))
613 		loadsegment(ds, next->ds);
614 
615 	load_TLS(next, cpu);
616 
617 	/*
618 	 * Switch FS and GS.
619 	 */
620 	{
621 		unsigned fsindex;
622 		asm volatile("movl %%fs,%0" : "=r" (fsindex));
623 		/* A segment register != 0 always requires a reload.
624 		   Also reload when it has changed.
625 		   When the prev process used a 64bit base, always reload
626 		   to avoid an information leak. */
627 		if (unlikely(fsindex | next->fsindex | prev->fs)) {
628 			loadsegment(fs, next->fsindex);
629 			/* Check whether the user used a selector != 0;
630 			 * if so, clear the 64bit base, since the overridden
631 			 * base is always mapped to the NULL selector.
632 			 */
633 			if (fsindex)
634 				prev->fs = 0;
635 		}
636 		/* When the next process has a 64bit base, use it */
637 		if (next->fs)
638 			wrmsrl(MSR_FS_BASE, next->fs);
639 		prev->fsindex = fsindex;
640 	}
641 	{
642 		unsigned gsindex;
643 		asm volatile("movl %%gs,%0" : "=r" (gsindex));
644 		if (unlikely(gsindex | next->gsindex | prev->gs)) {
645 			load_gs_index(next->gsindex);
646 			if (gsindex)
647 				prev->gs = 0;
648 		}
649 		if (next->gs)
650 			wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
651 		prev->gsindex = gsindex;
652 	}
653 
654 	/* Must be after DS reload */
655 	unlazy_fpu(prev_p);
656 
657 	/*
658 	 * Switch the PDA and FPU contexts.
659 	 */
660 	prev->userrsp = read_pda(oldrsp);
661 	write_pda(oldrsp, next->userrsp);
662 	write_pda(pcurrent, next_p);
663 
664 	write_pda(kernelstack,
665 	(unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
666 #ifdef CONFIG_CC_STACKPROTECTOR
667 	write_pda(stack_canary, next_p->stack_canary);
668 	/*
669 	 * Build time only check to make sure the stack_canary is at
670 	 * offset 40 in the pda; this is a gcc ABI requirement
671 	 */
672 	BUILD_BUG_ON(offsetof(struct x8664_pda, stack_canary) != 40);
673 #endif
674 
675 	/*
676 	 * Now maybe reload the debug registers and handle I/O bitmaps
677 	 */
678 	if (unlikely((task_thread_info(next_p)->flags & _TIF_WORK_CTXSW))
679 	    || test_tsk_thread_flag(prev_p, TIF_IO_BITMAP))
680 		__switch_to_xtra(prev_p, next_p, tss);
681 
682 	/* If the task has used the FPU in the last 5 timeslices, just do a
683 	 * full restore of the math state immediately to avoid the trap; the
684 	 * chances of needing the FPU soon are obviously high now
685 	 */
686 	if (next_p->fpu_counter > 5)
687 		math_state_restore();
688 	return prev_p;
689 }
690 
691 /*
692  * sys_execve() executes a new program.
693  */
694 asmlinkage
695 long sys_execve(char __user *name, char __user * __user *argv,
696 		char __user * __user *envp, struct pt_regs regs)
697 {
698 	long error;
699 	char * filename;
700 
701 	filename = getname(name);
702 	error = PTR_ERR(filename);
703 	if (IS_ERR(filename))
704 		return error;
705 	error = do_execve(filename, argv, envp, &regs);
706 	if (error == 0) {
707 		task_lock(current);
708 		current->ptrace &= ~PT_DTRACE;
709 		task_unlock(current);
710 	}
711 	putname(filename);
712 	return error;
713 }
714 
715 void set_personality_64bit(void)
716 {
717 	/* inherit personality from parent */
718 
719 	/* Make sure to be in 64bit mode */
720 	clear_thread_flag(TIF_IA32);
721 
722 	/* TBD: overwrites user setup. Should have two bits.
723 	   But 64bit processes have always behaved this way,
724 	   so it's not too bad. The main problem is just that
725 	   32bit children are affected again. */
726 	current->personality &= ~READ_IMPLIES_EXEC;
727 }
728 
729 asmlinkage long sys_fork(struct pt_regs *regs)
730 {
731 	return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL);
732 }
733 
734 asmlinkage long
735 sys_clone(unsigned long clone_flags, unsigned long newsp,
736 	  void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
737 {
738 	if (!newsp)
739 		newsp = regs->rsp;
740 	return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
741 }
742 
743 /*
744  * This is trivial, and on the face of it looks like it
745  * could equally well be done in user mode.
746  *
747  * Not so, for quite unobvious reasons - register pressure.
748  * In user mode vfork() cannot have a stack frame, and if
749  * done by calling the "clone()" system call directly, you
750  * do not have enough call-clobbered registers to hold all
751  * the information you need.
752  */
753 asmlinkage long sys_vfork(struct pt_regs *regs)
754 {
755 	return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->rsp, regs, 0,
756 		    NULL, NULL);
757 }
758 
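/*
 * Walk the sleeping task's saved frame pointers to find the first return
 * address outside the scheduler (the task's wait channel).
 */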
759 unsigned long get_wchan(struct task_struct *p)
760 {
761 	unsigned long stack;
762 	u64 fp, rip;
763 	int count = 0;
764 
765 	if (!p || p == current || p->state == TASK_RUNNING)
766 		return 0;
767 	stack = (unsigned long)task_stack_page(p);
768 	if (p->thread.rsp < stack || p->thread.rsp > stack+THREAD_SIZE)
769 		return 0;
770 	fp = *(u64 *)(p->thread.rsp);
771 	do {
772 		if (fp < (unsigned long)stack ||
773 		    fp > (unsigned long)stack+THREAD_SIZE)
774 			return 0;
775 		rip = *(u64 *)(fp+8);
776 		if (!in_sched_functions(rip))
777 			return rip;
778 		fp = *(u64 *)fp;
779 	} while (count++ < 16);
780 	return 0;
781 }
782 
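/*
 * Get or set the FS/GS base for a task. Small bases are handled via a
 * GDT TLS entry; larger ones are written to the base MSRs.
 */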
783 long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
784 {
785 	int ret = 0;
786 	int doit = task == current;
787 	int cpu;
788 
789 	switch (code) {
790 	case ARCH_SET_GS:
791 		if (addr >= TASK_SIZE_OF(task))
792 			return -EPERM;
793 		cpu = get_cpu();
794 		/* handle small bases via the GDT because that's faster to
795 		   switch. */
796 		if (addr <= 0xffffffff) {
797 			set_32bit_tls(task, GS_TLS, addr);
798 			if (doit) {
799 				load_TLS(&task->thread, cpu);
800 				load_gs_index(GS_TLS_SEL);
801 			}
802 			task->thread.gsindex = GS_TLS_SEL;
803 			task->thread.gs = 0;
804 		} else {
805 			task->thread.gsindex = 0;
806 			task->thread.gs = addr;
807 			if (doit) {
808 				load_gs_index(0);
809 				ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
810 			}
811 		}
812 		put_cpu();
813 		break;
814 	case ARCH_SET_FS:
815 		/* Not strictly needed for fs, but do it for symmetry
816 		   with gs */
817 		if (addr >= TASK_SIZE_OF(task))
818 			return -EPERM;
819 		cpu = get_cpu();
820 		/* handle small bases via the GDT because that's faster to
821 		   switch. */
822 		if (addr <= 0xffffffff) {
823 			set_32bit_tls(task, FS_TLS, addr);
824 			if (doit) {
825 				load_TLS(&task->thread, cpu);
826 				asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL));
827 			}
828 			task->thread.fsindex = FS_TLS_SEL;
829 			task->thread.fs = 0;
830 		} else {
831 			task->thread.fsindex = 0;
832 			task->thread.fs = addr;
833 			if (doit) {
834 				/* set the selector to 0 to not confuse
835 				   __switch_to */
836 				asm volatile("movl %0,%%fs" :: "r" (0));
837 				ret = checking_wrmsrl(MSR_FS_BASE, addr);
838 			}
839 		}
840 		put_cpu();
841 		break;
842 	case ARCH_GET_FS: {
843 		unsigned long base;
844 		if (task->thread.fsindex == FS_TLS_SEL)
845 			base = read_32bit_tls(task, FS_TLS);
846 		else if (doit)
847 			rdmsrl(MSR_FS_BASE, base);
848 		else
849 			base = task->thread.fs;
850 		ret = put_user(base, (unsigned long __user *)addr);
851 		break;
852 	}
853 	case ARCH_GET_GS: {
854 		unsigned long base;
855 		unsigned gsindex;
856 		if (task->thread.gsindex == GS_TLS_SEL)
857 			base = read_32bit_tls(task, GS_TLS);
858 		else if (doit) {
859  			asm("movl %%gs,%0" : "=r" (gsindex));
860 			if (gsindex)
861 				rdmsrl(MSR_KERNEL_GS_BASE, base);
862 			else
863 				base = task->thread.gs;
864 		}
865 		else
866 			base = task->thread.gs;
867 		ret = put_user(base, (unsigned long __user *)addr);
868 		break;
869 	}
870 
871 	default:
872 		ret = -EINVAL;
873 		break;
874 	}
875 
876 	return ret;
877 }
878 
879 long sys_arch_prctl(int code, unsigned long addr)
880 {
881 	return do_arch_prctl(current, code, addr);
882 }
883 
884 /*
885  * Capture the user space registers if the task is not running (in user space)
886  */
887 int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
888 {
889 	struct pt_regs *pp, ptregs;
890 
891 	pp = task_pt_regs(tsk);
892 
893 	ptregs = *pp;
894 	ptregs.cs &= 0xffff;
895 	ptregs.ss &= 0xffff;
896 
897 	elf_core_copy_regs(regs, &ptregs);
898 
899 	return 1;
900 }
901 
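/*
 * Apply a small random offset to the initial stack pointer (unless
 * randomization is disabled) and align it to 16 bytes.
 */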
902 unsigned long arch_align_stack(unsigned long sp)
903 {
904 	if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
905 		sp -= get_random_int() % 8192;
906 	return sp & ~0xf;
907 }
908