xref: /linux/arch/x86/kernel/process_64.c (revision 7f3edee81fbd49114c28057512906f169caa0bed)
1 /*
2  *  Copyright (C) 1995  Linus Torvalds
3  *
4  *  Pentium III FXSR, SSE support
5  *	Gareth Hughes <gareth@valinux.com>, May 2000
6  *
7  *  X86-64 port
8  *	Andi Kleen.
9  *
10  *	CPU hotplug support - ashok.raj@intel.com
11  */
12 
13 /*
14  * This file handles the architecture-dependent parts of process handling.
15  */
16 
17 #include <stdarg.h>
18 
19 #include <linux/cpu.h>
20 #include <linux/errno.h>
21 #include <linux/sched.h>
22 #include <linux/kernel.h>
23 #include <linux/mm.h>
24 #include <linux/fs.h>
25 #include <linux/elfcore.h>
26 #include <linux/smp.h>
27 #include <linux/slab.h>
28 #include <linux/user.h>
29 #include <linux/module.h>
30 #include <linux/a.out.h>
31 #include <linux/interrupt.h>
32 #include <linux/delay.h>
33 #include <linux/ptrace.h>
34 #include <linux/utsname.h>
35 #include <linux/random.h>
36 #include <linux/notifier.h>
37 #include <linux/kprobes.h>
38 #include <linux/kdebug.h>
39 #include <linux/tick.h>
40 
41 #include <asm/uaccess.h>
42 #include <asm/pgtable.h>
43 #include <asm/system.h>
44 #include <asm/io.h>
45 #include <asm/processor.h>
46 #include <asm/i387.h>
47 #include <asm/mmu_context.h>
48 #include <asm/pda.h>
49 #include <asm/prctl.h>
50 #include <asm/desc.h>
51 #include <asm/proto.h>
52 #include <asm/ia32.h>
53 #include <asm/idle.h>
54 
55 asmlinkage extern void ret_from_fork(void);
56 
57 unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;
58 
59 unsigned long boot_option_idle_override = 0;
60 EXPORT_SYMBOL(boot_option_idle_override);
61 
62 /*
63  * Power management idle function, if any.
64  */
65 void (*pm_idle)(void);
66 EXPORT_SYMBOL(pm_idle);
67 static DEFINE_PER_CPU(unsigned int, cpu_idle_state);
68 
69 static ATOMIC_NOTIFIER_HEAD(idle_notifier);
70 
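/*
 * Note: users elsewhere in the kernel (presumably drivers that care about
 * CPU idle state) register a notifier_block here; it is called with
 * IDLE_START when a CPU enters the idle loop and IDLE_END when it leaves
 * (see enter_idle()/__exit_idle() below).
 */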
71 void idle_notifier_register(struct notifier_block *n)
72 {
73 	atomic_notifier_chain_register(&idle_notifier, n);
74 }
75 EXPORT_SYMBOL_GPL(idle_notifier_register);
76 
77 void idle_notifier_unregister(struct notifier_block *n)
78 {
79 	atomic_notifier_chain_unregister(&idle_notifier, n);
80 }
81 EXPORT_SYMBOL(idle_notifier_unregister);
82 
83 void enter_idle(void)
84 {
85 	write_pda(isidle, 1);
86 	atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
87 }
88 
89 static void __exit_idle(void)
90 {
91 	if (test_and_clear_bit_pda(0, isidle) == 0)
92 		return;
93 	atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
94 }
95 
96 /* Called from interrupts to signify idle end */
97 void exit_idle(void)
98 {
99 	/* idle loop has pid 0 */
100 	if (current->pid)
101 		return;
102 	__exit_idle();
103 }
104 
105 /*
106  * We use this if we don't have any better
107  * idle routine.
108  */
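/*
 * Wakeup protocol (summary): clearing TS_POLLING tells remote CPUs that
 * setting TIF_NEED_RESCHED alone will not wake us, so the scheduler sends
 * a resched IPI instead.  The smp_mb() below orders the TS_POLLING clear
 * against the need_resched() test, so a wakeup cannot slip in between the
 * test and the HLT.
 */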
109 static void default_idle(void)
110 {
111 	current_thread_info()->status &= ~TS_POLLING;
112 	/*
113 	 * TS_POLLING-cleared state must be visible before we
114 	 * test NEED_RESCHED:
115 	 */
116 	smp_mb();
117 	local_irq_disable();
118 	if (!need_resched()) {
119 		/* Enable interrupts one instruction before HLT; the STI
120 		   instruction shadow guarantees there is no race window. */
121 		safe_halt();
122 	} else
123 		local_irq_enable();
124 	current_thread_info()->status |= TS_POLLING;
125 }
126 
127 /*
128  * On SMP it's slightly faster (but much more power-consuming!)
129  * to poll the ->need_resched flag instead of waiting for the
130  * cross-CPU IPI to arrive. Use this option with caution.
131  */
132 static void poll_idle(void)
133 {
134 	local_irq_enable();
135 	cpu_relax();
136 }
137 
138 static void do_nothing(void *unused)
139 {
140 }
141 
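/*
 * cpu_idle_wait() is used when the idle handler is changed at run time
 * (callers live outside this file, e.g. the ACPI idle code): it returns
 * only after every online CPU has passed through the idle loop at least
 * once, so nobody can still be executing the old pm_idle routine.
 */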
142 void cpu_idle_wait(void)
143 {
144 	unsigned int cpu, this_cpu = get_cpu();
145 	cpumask_t map, tmp = current->cpus_allowed;
146 
147 	set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
148 	put_cpu();
149 
150 	cpus_clear(map);
151 	for_each_online_cpu(cpu) {
152 		per_cpu(cpu_idle_state, cpu) = 1;
153 		cpu_set(cpu, map);
154 	}
155 
156 	__get_cpu_var(cpu_idle_state) = 0;
157 
158 	wmb();
159 	do {
160 		ssleep(1);
161 		for_each_online_cpu(cpu) {
162 			if (cpu_isset(cpu, map) &&
163 					!per_cpu(cpu_idle_state, cpu))
164 				cpu_clear(cpu, map);
165 		}
166 		cpus_and(map, map, cpu_online_map);
167 		/*
168 		 * We waited one second; if a CPU still has not entered idle,
169 		 * it may be because it is already idle and is not waking up
170 		 * because it has nothing to do.
171 		 * Give all the remaining CPUs a kick.
172 		 */
173 		smp_call_function_mask(map, do_nothing, 0, 0);
174 	} while (!cpus_empty(map));
175 
176 	set_cpus_allowed(current, tmp);
177 }
178 EXPORT_SYMBOL_GPL(cpu_idle_wait);
179 
180 #ifdef CONFIG_HOTPLUG_CPU
181 DECLARE_PER_CPU(int, cpu_state);
182 
183 #include <asm/nmi.h>
184 /* We halt the CPU with physical CPU hotplug */
185 static inline void play_dead(void)
186 {
187 	idle_task_exit();
188 	wbinvd();
189 	mb();
190 	/* Ack it */
191 	__get_cpu_var(cpu_state) = CPU_DEAD;
192 
193 	local_irq_disable();
194 	while (1)
195 		halt();
196 }
197 #else
198 static inline void play_dead(void)
199 {
200 	BUG();
201 }
202 #endif /* CONFIG_HOTPLUG_CPU */
203 
204 /*
205  * The idle thread. There's no useful work to be
206  * done, so just try to conserve power and have a
207  * low exit latency (i.e. sit in a loop waiting for
208  * somebody to say that they'd like to reschedule).
209  */
210 void cpu_idle(void)
211 {
212 	current_thread_info()->status |= TS_POLLING;
213 	/* endless idle loop with no priority at all */
214 	while (1) {
215 		while (!need_resched()) {
216 			void (*idle)(void);
217 
218 			if (__get_cpu_var(cpu_idle_state))
219 				__get_cpu_var(cpu_idle_state) = 0;
220 
221 			tick_nohz_stop_sched_tick();
222 
223 			rmb();
224 			idle = pm_idle;
225 			if (!idle)
226 				idle = default_idle;
227 			if (cpu_is_offline(smp_processor_id()))
228 				play_dead();
229 			/*
230 			 * Idle routines should keep interrupts disabled
231 			 * from here on, until they go to idle.
232 			 * Otherwise, idle callbacks can misfire.
233 			 */
234 			local_irq_disable();
235 			enter_idle();
236 			idle();
237 			/* In many cases the interrupt that ended idle
238 			   has already called exit_idle. But some idle
239 			   loops can be woken up without an interrupt. */
240 			__exit_idle();
241 		}
242 
243 		tick_nohz_restart_sched_tick();
244 		preempt_enable_no_resched();
245 		schedule();
246 		preempt_disable();
247 	}
248 }
249 
250 /*
251  * This uses the MONITOR/MWAIT instructions on P4 processors with PNI,
252  * which can obviate the need for an IPI to trigger a need_resched check.
253  * We execute MONITOR against need_resched and enter an optimized wait
254  * state through MWAIT. Whenever someone changes need_resched, we are
255  * woken up from MWAIT without an IPI.
256  *
257  * Starting with Core Duo processors, MWAIT can take hints based on CPU
258  * capability.
259  */
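/*
 * The second need_resched() test below, after MONITOR has been armed,
 * closes the race where the flag is set between the first test and the
 * MONITOR: in that case we skip MWAIT instead of sleeping on a store
 * that has already happened.
 */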
260 void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
261 {
262 	if (!need_resched()) {
263 		__monitor((void *)&current_thread_info()->flags, 0, 0);
264 		smp_mb();
265 		if (!need_resched())
266 			__mwait(eax, ecx);
267 	}
268 }
269 
270 /* Default MONITOR/MWAIT with no hints, used for default C1 state */
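/*
 * __sti_mwait() presumably pairs STI with MWAIT the same way safe_halt()
 * pairs STI with HLT: interrupts are re-enabled only in the instruction
 * shadow, so a wakeup interrupt cannot be taken between the need_resched()
 * check and entering MWAIT.
 */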
271 static void mwait_idle(void)
272 {
273 	if (!need_resched()) {
274 		__monitor((void *)&current_thread_info()->flags, 0, 0);
275 		smp_mb();
276 		if (!need_resched())
277 			__sti_mwait(0, 0);
278 		else
279 			local_irq_enable();
280 	} else {
281 		local_irq_enable();
282 	}
283 }
284 
285 void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
286 {
287 	static int printed;
288 	if (cpu_has(c, X86_FEATURE_MWAIT)) {
289 		/*
290 		 * Skip if setup has overridden the idle routine.
291 		 * If one CPU supports mwait, all CPUs support mwait.
292 		 */
293 		if (!pm_idle) {
294 			if (!printed) {
295 				printk(KERN_INFO "using mwait in idle threads.\n");
296 				printed = 1;
297 			}
298 			pm_idle = mwait_idle;
299 		}
300 	}
301 }
302 
303 static int __init idle_setup(char *str)
304 {
305 	if (!strcmp(str, "poll")) {
306 		printk(KERN_INFO "using polling idle threads.\n");
307 		pm_idle = poll_idle;
308 	} else if (!strcmp(str, "mwait"))
309 		force_mwait = 1;
310 	else
311 		return -1;
312 
313 	boot_option_idle_override = 1;
314 	return 0;
315 }
316 early_param("idle", idle_setup);
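/*
 * Example: booting with "idle=poll" installs poll_idle() as pm_idle, while
 * "idle=mwait" only sets force_mwait; both set boot_option_idle_override,
 * which other idle-selection code (outside this file) is expected to honour.
 */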
317 
318 /* Also prints some state that isn't saved in the pt_regs */
319 void __show_regs(struct pt_regs * regs)
320 {
321 	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
322 	unsigned long d0, d1, d2, d3, d6, d7;
323 	unsigned int fsindex,gsindex;
324 	unsigned int ds,cs,es;
325 
326 	printk("\n");
327 	print_modules();
328 	printk("Pid: %d, comm: %.20s %s %s %.*s\n",
329 		current->pid, current->comm, print_tainted(),
330 		init_utsname()->release,
331 		(int)strcspn(init_utsname()->version, " "),
332 		init_utsname()->version);
333 	printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip);
334 	printk_address(regs->rip);
335 	printk("RSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss, regs->rsp,
336 		regs->eflags);
337 	printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
338 	       regs->rax, regs->rbx, regs->rcx);
339 	printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
340 	       regs->rdx, regs->rsi, regs->rdi);
341 	printk("RBP: %016lx R08: %016lx R09: %016lx\n",
342 	       regs->rbp, regs->r8, regs->r9);
343 	printk("R10: %016lx R11: %016lx R12: %016lx\n",
344 	       regs->r10, regs->r11, regs->r12);
345 	printk("R13: %016lx R14: %016lx R15: %016lx\n",
346 	       regs->r13, regs->r14, regs->r15);
347 
348 	asm("movl %%ds,%0" : "=r" (ds));
349 	asm("movl %%cs,%0" : "=r" (cs));
350 	asm("movl %%es,%0" : "=r" (es));
351 	asm("movl %%fs,%0" : "=r" (fsindex));
352 	asm("movl %%gs,%0" : "=r" (gsindex));
353 
354 	rdmsrl(MSR_FS_BASE, fs);
355 	rdmsrl(MSR_GS_BASE, gs);
356 	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
357 
358 	cr0 = read_cr0();
359 	cr2 = read_cr2();
360 	cr3 = read_cr3();
361 	cr4 = read_cr4();
362 
363 	printk("FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
364 	       fs,fsindex,gs,gsindex,shadowgs);
365 	printk("CS:  %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0);
366 	printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4);
367 
368 	get_debugreg(d0, 0);
369 	get_debugreg(d1, 1);
370 	get_debugreg(d2, 2);
371 	printk("DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
372 	get_debugreg(d3, 3);
373 	get_debugreg(d6, 6);
374 	get_debugreg(d7, 7);
375 	printk("DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
376 }
377 
378 void show_regs(struct pt_regs *regs)
379 {
380 	printk("CPU %d:", smp_processor_id());
381 	__show_regs(regs);
382 	show_trace(NULL, regs, (void *)(regs + 1));
383 }
384 
385 /*
386  * Free current thread data structures etc..
387  */
388 void exit_thread(void)
389 {
390 	struct task_struct *me = current;
391 	struct thread_struct *t = &me->thread;
392 
393 	if (me->thread.io_bitmap_ptr) {
394 		struct tss_struct *tss = &per_cpu(init_tss, get_cpu());
395 
396 		kfree(t->io_bitmap_ptr);
397 		t->io_bitmap_ptr = NULL;
398 		clear_thread_flag(TIF_IO_BITMAP);
399 		/*
400 		 * Careful, clear this in the TSS too:
401 		 */
402 		memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
403 		t->io_bitmap_max = 0;
404 		put_cpu();
405 	}
406 }
407 
408 void flush_thread(void)
409 {
410 	struct task_struct *tsk = current;
411 
412 	if (test_tsk_thread_flag(tsk, TIF_ABI_PENDING)) {
413 		clear_tsk_thread_flag(tsk, TIF_ABI_PENDING);
414 		if (test_tsk_thread_flag(tsk, TIF_IA32)) {
415 			clear_tsk_thread_flag(tsk, TIF_IA32);
416 		} else {
417 			set_tsk_thread_flag(tsk, TIF_IA32);
418 			current_thread_info()->status |= TS_COMPAT;
419 		}
420 	}
421 	clear_tsk_thread_flag(tsk, TIF_DEBUG);
422 
423 	tsk->thread.debugreg0 = 0;
424 	tsk->thread.debugreg1 = 0;
425 	tsk->thread.debugreg2 = 0;
426 	tsk->thread.debugreg3 = 0;
427 	tsk->thread.debugreg6 = 0;
428 	tsk->thread.debugreg7 = 0;
429 	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
430 	/*
431 	 * Forget coprocessor state..
432 	 */
433 	clear_fpu(tsk);
434 	clear_used_math();
435 }
436 
437 void release_thread(struct task_struct *dead_task)
438 {
439 	if (dead_task->mm) {
440 		if (dead_task->mm->context.size) {
441 			printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
442 					dead_task->comm,
443 					dead_task->mm->context.ldt,
444 					dead_task->mm->context.size);
445 			BUG();
446 		}
447 	}
448 }
449 
450 static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
451 {
452 	struct user_desc ud = {
453 		.base_addr = addr,
454 		.limit = 0xfffff,
455 		.seg_32bit = 1,
456 		.limit_in_pages = 1,
457 		.useable = 1,
458 	};
459 	struct n_desc_struct *desc = (void *)t->thread.tls_array;
460 	desc += tls;
461 	desc->a = LDT_entry_a(&ud);
462 	desc->b = LDT_entry_b(&ud);
463 }
464 
465 static inline u32 read_32bit_tls(struct task_struct *t, int tls)
466 {
467 	struct desc_struct *desc = (void *)t->thread.tls_array;
468 	desc += tls;
469 	return desc->base0 |
470 		(((u32)desc->base1) << 16) |
471 		(((u32)desc->base2) << 24);
472 }
473 
474 /*
475  * This gets called before we allocate a new thread and copy
476  * the current task into it.
477  */
478 void prepare_to_copy(struct task_struct *tsk)
479 {
480 	unlazy_fpu(tsk);
481 }
482 
483 int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp,
484 		unsigned long unused,
485 	struct task_struct * p, struct pt_regs * regs)
486 {
487 	int err;
488 	struct pt_regs * childregs;
489 	struct task_struct *me = current;
490 
491 	childregs = ((struct pt_regs *)
492 			(THREAD_SIZE + task_stack_page(p))) - 1;
493 	*childregs = *regs;
494 
495 	childregs->rax = 0;
496 	childregs->rsp = rsp;
497 	if (rsp == ~0UL)
498 		childregs->rsp = (unsigned long)childregs;
499 
500 	p->thread.rsp = (unsigned long) childregs;
501 	p->thread.rsp0 = (unsigned long) (childregs+1);
502 	p->thread.userrsp = me->thread.userrsp;
503 
504 	set_tsk_thread_flag(p, TIF_FORK);
505 
506 	p->thread.fs = me->thread.fs;
507 	p->thread.gs = me->thread.gs;
508 
509 	asm("mov %%gs,%0" : "=m" (p->thread.gsindex));
510 	asm("mov %%fs,%0" : "=m" (p->thread.fsindex));
511 	asm("mov %%es,%0" : "=m" (p->thread.es));
512 	asm("mov %%ds,%0" : "=m" (p->thread.ds));
513 
514 	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
515 		p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
516 		if (!p->thread.io_bitmap_ptr) {
517 			p->thread.io_bitmap_max = 0;
518 			return -ENOMEM;
519 		}
520 		memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
521 				IO_BITMAP_BYTES);
522 		set_tsk_thread_flag(p, TIF_IO_BITMAP);
523 	}
524 
525 	/*
526 	 * Set a new TLS for the child thread?
527 	 */
528 	if (clone_flags & CLONE_SETTLS) {
529 #ifdef CONFIG_IA32_EMULATION
530 		if (test_thread_flag(TIF_IA32))
531 			err = ia32_child_tls(p, childregs);
532 		else
533 #endif
534 			err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
535 		if (err)
536 			goto out;
537 	}
538 	err = 0;
539 out:
540 	if (err && p->thread.io_bitmap_ptr) {
541 		kfree(p->thread.io_bitmap_ptr);
542 		p->thread.io_bitmap_max = 0;
543 	}
544 	return err;
545 }
546 
547 /*
548  * This special macro can be used to load a debugging register
549  */
550 #define loaddebug(thread,r) set_debugreg(thread->debugreg ## r, r)
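/* e.g. loaddebug(next, 0) expands, via token pasting, to
   set_debugreg(next->debugreg0, 0) */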
551 
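/*
 * Slow path of the context switch: reload the hardware debug registers if
 * the incoming task uses them, and copy or scrub the I/O permission bitmap
 * in the TSS when either task has one.
 */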
552 static inline void __switch_to_xtra(struct task_struct *prev_p,
553 			     	    struct task_struct *next_p,
554 			     	    struct tss_struct *tss)
555 {
556 	struct thread_struct *prev, *next;
557 
558 	prev = &prev_p->thread;
559 	next = &next_p->thread;
560 
561 	if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
562 		loaddebug(next, 0);
563 		loaddebug(next, 1);
564 		loaddebug(next, 2);
565 		loaddebug(next, 3);
566 		/* no 4 and 5 */
567 		loaddebug(next, 6);
568 		loaddebug(next, 7);
569 	}
570 
571 	if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
572 		/*
573 		 * Copy the relevant range of the IO bitmap.
574 		 * Normally this is 128 bytes or less:
575 		 */
576 		memcpy(tss->io_bitmap, next->io_bitmap_ptr,
577 		       max(prev->io_bitmap_max, next->io_bitmap_max));
578 	} else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
579 		/*
580 		 * Clear any possible leftover bits:
581 		 */
582 		memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
583 	}
584 }
585 
586 /*
587  *	switch_to(x,y) should switch tasks from x to y.
588  *
589  * This could still be optimized:
590  * - fold all the options into a flag word and test it with a single test.
591  * - could test fs/gs bitsliced
592  *
593  * Kprobes not supported here. Set the probe on schedule instead.
594  */
595 struct task_struct *
596 __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
597 {
598 	struct thread_struct *prev = &prev_p->thread,
599 				 *next = &next_p->thread;
600 	int cpu = smp_processor_id();
601 	struct tss_struct *tss = &per_cpu(init_tss, cpu);
602 
603 	/* we're going to use this soon, after a few expensive things */
604 	if (next_p->fpu_counter>5)
605 		prefetch(&next->i387.fxsave);
606 
607 	/*
608 	 * Reload esp0, LDT and the page table pointer:
609 	 */
610 	tss->rsp0 = next->rsp0;
611 
612 	/*
613 	 * Switch DS and ES.
614 	 * This won't pick up thread selector changes, but I guess that is ok.
615 	 */
616 	asm volatile("mov %%es,%0" : "=m" (prev->es));
617 	if (unlikely(next->es | prev->es))
618 		loadsegment(es, next->es);
619 
620 	asm volatile ("mov %%ds,%0" : "=m" (prev->ds));
621 	if (unlikely(next->ds | prev->ds))
622 		loadsegment(ds, next->ds);
623 
624 	load_TLS(next, cpu);
625 
626 	/*
627 	 * Switch FS and GS.
628 	 */
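	/*
	 * A thread's fs/gs is described either by a nonzero selector
	 * (fsindex/gsindex, typically a 32-bit task using a GDT/LDT entry)
	 * or by a zero selector plus a 64-bit base kept in thread.fs/gs and
	 * loaded through MSR_FS_BASE / MSR_KERNEL_GS_BASE.  The code below
	 * only reloads what can actually differ, to keep the switch cheap.
	 */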
629 	{
630 		unsigned fsindex;
631 		asm volatile("movl %%fs,%0" : "=r" (fsindex));
632 		/* A segment register != 0 always requires a reload.
633 		   Also reload when it has changed.
634 		   When the previous process used a 64-bit base, always
635 		   reload to avoid an information leak. */
636 		if (unlikely(fsindex | next->fsindex | prev->fs)) {
637 			loadsegment(fs, next->fsindex);
638 			/* Check if the user used a selector != 0; if so,
639 			 * clear the 64-bit base, since the overloaded base
640 			 * is always mapped to the null selector.
641 			 */
642 			if (fsindex)
643 				prev->fs = 0;
644 		}
645 		/* when next process has a 64bit base use it */
646 		if (next->fs)
647 			wrmsrl(MSR_FS_BASE, next->fs);
648 		prev->fsindex = fsindex;
649 	}
650 	{
651 		unsigned gsindex;
652 		asm volatile("movl %%gs,%0" : "=r" (gsindex));
653 		if (unlikely(gsindex | next->gsindex | prev->gs)) {
654 			load_gs_index(next->gsindex);
655 			if (gsindex)
656 				prev->gs = 0;
657 		}
658 		if (next->gs)
659 			wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
660 		prev->gsindex = gsindex;
661 	}
662 
663 	/* Must be after DS reload */
664 	unlazy_fpu(prev_p);
665 
666 	/*
667 	 * Switch the PDA and FPU contexts.
668 	 */
669 	prev->userrsp = read_pda(oldrsp);
670 	write_pda(oldrsp, next->userrsp);
671 	write_pda(pcurrent, next_p);
672 
673 	write_pda(kernelstack,
674 	(unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
675 #ifdef CONFIG_CC_STACKPROTECTOR
676 	write_pda(stack_canary, next_p->stack_canary);
677 	/*
678 	 * Build time only check to make sure the stack_canary is at
679 	 * offset 40 in the pda; this is a gcc ABI requirement
680 	 */
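	/*
	 * (gcc's x86-64 stack-protector code reads the canary from %gs:40,
	 * and the kernel points %gs at the PDA, hence offset 40.)
	 */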
681 	BUILD_BUG_ON(offsetof(struct x8664_pda, stack_canary) != 40);
682 #endif
683 
684 	/*
685 	 * Now maybe reload the debug registers and handle I/O bitmaps
686 	 */
687 	if (unlikely((task_thread_info(next_p)->flags & _TIF_WORK_CTXSW))
688 	    || test_tsk_thread_flag(prev_p, TIF_IO_BITMAP))
689 		__switch_to_xtra(prev_p, next_p, tss);
690 
691 	/* If the task has used the FPU in the last 5 timeslices, just do a
692 	 * full restore of the math state immediately to avoid the trap; the
693 	 * chances of needing the FPU soon are obviously high now.
694 	 */
695 	if (next_p->fpu_counter>5)
696 		math_state_restore();
697 	return prev_p;
698 }
699 
700 /*
701  * sys_execve() executes a new program.
702  */
703 asmlinkage
704 long sys_execve(char __user *name, char __user * __user *argv,
705 		char __user * __user *envp, struct pt_regs regs)
706 {
707 	long error;
708 	char * filename;
709 
710 	filename = getname(name);
711 	error = PTR_ERR(filename);
712 	if (IS_ERR(filename))
713 		return error;
714 	error = do_execve(filename, argv, envp, &regs);
715 	if (error == 0) {
716 		task_lock(current);
717 		current->ptrace &= ~PT_DTRACE;
718 		task_unlock(current);
719 	}
720 	putname(filename);
721 	return error;
722 }
723 
724 void set_personality_64bit(void)
725 {
726 	/* inherit personality from parent */
727 
728 	/* Make sure to be in 64bit mode */
729 	clear_thread_flag(TIF_IA32);
730 
731 	/* TBD: overwrites user setup. Should have two bits.
732 	   But 64bit processes have always behaved this way,
733 	   so it's not too bad. The main problem is just that
734 	   32-bit children are affected again. */
735 	current->personality &= ~READ_IMPLIES_EXEC;
736 }
737 
738 asmlinkage long sys_fork(struct pt_regs *regs)
739 {
740 	return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL);
741 }
742 
743 asmlinkage long
744 sys_clone(unsigned long clone_flags, unsigned long newsp,
745 	  void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
746 {
747 	if (!newsp)
748 		newsp = regs->rsp;
749 	return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
750 }
751 
752 /*
753  * This is trivial, and on the face of it looks like it
754  * could equally well be done in user mode.
755  *
756  * Not so, for quite unobvious reasons - register pressure.
757  * In user mode vfork() cannot have a stack frame, and if
758  * done by calling the "clone()" system call directly, you
759  * do not have enough call-clobbered registers to hold all
760  * the information you need.
761  */
762 asmlinkage long sys_vfork(struct pt_regs *regs)
763 {
764 	return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->rsp, regs, 0,
765 		    NULL, NULL);
766 }
767 
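/*
 * Walk the sleeping task's saved frame pointers (return address at fp+8,
 * next frame at *fp) until a return address outside the scheduler is
 * found.  This presumably relies on the kernel being built with frame
 * pointers; the walk gives up after 16 frames.
 */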
768 unsigned long get_wchan(struct task_struct *p)
769 {
770 	unsigned long stack;
771 	u64 fp,rip;
772 	int count = 0;
773 
774 	if (!p || p == current || p->state==TASK_RUNNING)
775 		return 0;
776 	stack = (unsigned long)task_stack_page(p);
777 	if (p->thread.rsp < stack || p->thread.rsp > stack+THREAD_SIZE)
778 		return 0;
779 	fp = *(u64 *)(p->thread.rsp);
780 	do {
781 		if (fp < (unsigned long)stack ||
782 		    fp > (unsigned long)stack+THREAD_SIZE)
783 			return 0;
784 		rip = *(u64 *)(fp+8);
785 		if (!in_sched_functions(rip))
786 			return rip;
787 		fp = *(u64 *)fp;
788 	} while (count++ < 16);
789 	return 0;
790 }
791 
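/*
 * Backend for arch_prctl(2).  ARCH_SET_FS/ARCH_SET_GS install a new segment
 * base: bases that fit in 32 bits go into a GDT TLS slot (cheaper to
 * switch), larger ones go straight into MSR_FS_BASE / MSR_KERNEL_GS_BASE.
 * ARCH_GET_FS/ARCH_GET_GS read the base back, touching the MSR only when
 * the value may be live on the current CPU.
 */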
792 long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
793 {
794 	int ret = 0;
795 	int doit = task == current;
796 	int cpu;
797 
798 	switch (code) {
799 	case ARCH_SET_GS:
800 		if (addr >= TASK_SIZE_OF(task))
801 			return -EPERM;
802 		cpu = get_cpu();
803 		/* handle small bases via the GDT because that's faster to
804 		   switch. */
805 		if (addr <= 0xffffffff) {
806 			set_32bit_tls(task, GS_TLS, addr);
807 			if (doit) {
808 				load_TLS(&task->thread, cpu);
809 				load_gs_index(GS_TLS_SEL);
810 			}
811 			task->thread.gsindex = GS_TLS_SEL;
812 			task->thread.gs = 0;
813 		} else {
814 			task->thread.gsindex = 0;
815 			task->thread.gs = addr;
816 			if (doit) {
817 				load_gs_index(0);
818 				ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
819 			}
820 		}
821 		put_cpu();
822 		break;
823 	case ARCH_SET_FS:
824 		/* Not strictly needed for fs, but do it for symmetry
825 		   with gs */
826 		if (addr >= TASK_SIZE_OF(task))
827 			return -EPERM;
828 		cpu = get_cpu();
829 		/* handle small bases via the GDT because that's faster to
830 		   switch. */
831 		if (addr <= 0xffffffff) {
832 			set_32bit_tls(task, FS_TLS, addr);
833 			if (doit) {
834 				load_TLS(&task->thread, cpu);
835 				asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL));
836 			}
837 			task->thread.fsindex = FS_TLS_SEL;
838 			task->thread.fs = 0;
839 		} else {
840 			task->thread.fsindex = 0;
841 			task->thread.fs = addr;
842 			if (doit) {
843 				/* set the selector to 0 so as not to confuse
844 				   __switch_to */
845 				asm volatile("movl %0,%%fs" :: "r" (0));
846 				ret = checking_wrmsrl(MSR_FS_BASE, addr);
847 			}
848 		}
849 		put_cpu();
850 		break;
851 	case ARCH_GET_FS: {
852 		unsigned long base;
853 		if (task->thread.fsindex == FS_TLS_SEL)
854 			base = read_32bit_tls(task, FS_TLS);
855 		else if (doit)
856 			rdmsrl(MSR_FS_BASE, base);
857 		else
858 			base = task->thread.fs;
859 		ret = put_user(base, (unsigned long __user *)addr);
860 		break;
861 	}
862 	case ARCH_GET_GS: {
863 		unsigned long base;
864 		unsigned gsindex;
865 		if (task->thread.gsindex == GS_TLS_SEL)
866 			base = read_32bit_tls(task, GS_TLS);
867 		else if (doit) {
868 			asm("movl %%gs,%0" : "=r" (gsindex));
869 			if (gsindex)
870 				rdmsrl(MSR_KERNEL_GS_BASE, base);
871 			else
872 				base = task->thread.gs;
873 		}
874 		else
875 			base = task->thread.gs;
876 		ret = put_user(base, (unsigned long __user *)addr);
877 		break;
878 	}
879 
880 	default:
881 		ret = -EINVAL;
882 		break;
883 	}
884 
885 	return ret;
886 }
887 
888 long sys_arch_prctl(int code, unsigned long addr)
889 {
890 	return do_arch_prctl(current, code, addr);
891 }
892 
893 /*
894  * Capture the user space registers if the task is not running (in user space)
895  */
896 int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
897 {
898 	struct pt_regs *pp, ptregs;
899 
900 	pp = task_pt_regs(tsk);
901 
902 	ptregs = *pp;
903 	ptregs.cs &= 0xffff;
904 	ptregs.ss &= 0xffff;
905 
906 	elf_core_copy_regs(regs, &ptregs);
907 
908 	return 1;
909 }
910 
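/*
 * Randomize the initial user stack pointer by up to 8k (unless the task's
 * personality or the randomize_va_space sysctl disables it) and keep the
 * result 16-byte aligned, as the ABI expects at process startup.
 */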
911 unsigned long arch_align_stack(unsigned long sp)
912 {
913 	if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
914 		sp -= get_random_int() % 8192;
915 	return sp & ~0xf;
916 }
917