xref: /linux/arch/x86/kernel/process_64.c (revision 643d1f7fe3aa12c8bdea6fa5b4ba874ff6dd601d)
/*
 *  Copyright (C) 1995  Linus Torvalds
 *
 *  Pentium III FXSR, SSE support
 *	Gareth Hughes <gareth@valinux.com>, May 2000
 *
 *  X86-64 port
 *	Andi Kleen.
 *
 *	CPU hotplug support - ashok.raj@intel.com
 */

/*
 * This file handles the architecture-dependent parts of process handling.
 */

#include <stdarg.h>

#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/a.out.h>
#include <linux/interrupt.h>
#include <linux/utsname.h>
#include <linux/delay.h>
#include <linux/module.h>
#include <linux/ptrace.h>
#include <linux/random.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/tick.h>

#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/system.h>
#include <asm/io.h>
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/mmu_context.h>
#include <asm/pda.h>
#include <asm/prctl.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/ia32.h>
#include <asm/idle.h>

asmlinkage extern void ret_from_fork(void);

unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;

unsigned long boot_option_idle_override = 0;
EXPORT_SYMBOL(boot_option_idle_override);

/*
 * Power management idle function, if any.
 */
void (*pm_idle)(void);
EXPORT_SYMBOL(pm_idle);
static DEFINE_PER_CPU(unsigned int, cpu_idle_state);

static ATOMIC_NOTIFIER_HEAD(idle_notifier);

void idle_notifier_register(struct notifier_block *n)
{
	atomic_notifier_chain_register(&idle_notifier, n);
}

void enter_idle(void)
{
	write_pda(isidle, 1);
	atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}

static void __exit_idle(void)
{
	if (test_and_clear_bit_pda(0, isidle) == 0)
		return;
	atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
}

/* Called from interrupts to signify idle end */
void exit_idle(void)
{
	/* idle loop has pid 0 */
	if (current->pid)
		return;
	__exit_idle();
}

/*
 * We use this if we don't have any better
 * idle routine.
 */
void default_idle(void)
{
	current_thread_info()->status &= ~TS_POLLING;
	/*
	 * TS_POLLING-cleared state must be visible before we
	 * test NEED_RESCHED:
	 */
	smp_mb();
	local_irq_disable();
	if (!need_resched()) {
		ktime_t t0, t1;
		u64 t0n, t1n;

		t0 = ktime_get();
		t0n = ktime_to_ns(t0);
		safe_halt();	/* enables interrupts racelessly */
		local_irq_disable();
		t1 = ktime_get();
		t1n = ktime_to_ns(t1);
		sched_clock_idle_wakeup_event(t1n - t0n);
	}
	local_irq_enable();
	current_thread_info()->status |= TS_POLLING;
}

/*
 * On SMP it's slightly faster (but much more power-consuming!)
 * to poll the ->need_resched flag instead of waiting for the
 * cross-CPU IPI to arrive. Use this option with caution.
 */
static void poll_idle(void)
{
	local_irq_enable();
	cpu_relax();
}
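
/*
 * Note: polling only helps because cpu_idle() runs with TS_POLLING set,
 * which tells the scheduler that this CPU will notice need_resched() by
 * itself, so the cross-CPU reschedule IPI can usually be skipped.
 */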

#ifdef CONFIG_HOTPLUG_CPU
DECLARE_PER_CPU(int, cpu_state);

#include <asm/nmi.h>
/* We halt the CPU with physical CPU hotplug */
static inline void play_dead(void)
{
	idle_task_exit();
	wbinvd();
	mb();
	/* Ack it */
	__get_cpu_var(cpu_state) = CPU_DEAD;

	local_irq_disable();
	while (1)
		halt();
}
#else
static inline void play_dead(void)
{
	BUG();
}
#endif /* CONFIG_HOTPLUG_CPU */

/*
 * The idle thread. There's no useful work to be
 * done, so just try to conserve power and have a
 * low exit latency (i.e. sit in a loop waiting for
 * somebody to say that they'd like to reschedule).
 */
void cpu_idle(void)
{
	current_thread_info()->status |= TS_POLLING;
	/* endless idle loop with no priority at all */
	while (1) {
		tick_nohz_stop_sched_tick();
		while (!need_resched()) {
			void (*idle)(void);

			if (__get_cpu_var(cpu_idle_state))
				__get_cpu_var(cpu_idle_state) = 0;

			rmb();
			idle = pm_idle;
			if (!idle)
				idle = default_idle;
			if (cpu_is_offline(smp_processor_id()))
				play_dead();
			/*
			 * Idle routines should keep interrupts disabled
			 * from here on, until they go to idle.
			 * Otherwise, idle callbacks can misfire.
			 */
			local_irq_disable();
			enter_idle();
			idle();
			/*
			 * In many cases the interrupt that ended idle
			 * has already called exit_idle. But some idle
			 * loops can be woken up without interrupt.
			 */
			__exit_idle();
		}

		tick_nohz_restart_sched_tick();
		preempt_enable_no_resched();
		schedule();
		preempt_disable();
	}
}

static void do_nothing(void *unused)
{
}

void cpu_idle_wait(void)
{
	unsigned int cpu, this_cpu = get_cpu();
	cpumask_t map, tmp = current->cpus_allowed;

	set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
	put_cpu();

	cpus_clear(map);
	for_each_online_cpu(cpu) {
		per_cpu(cpu_idle_state, cpu) = 1;
		cpu_set(cpu, map);
	}

	__get_cpu_var(cpu_idle_state) = 0;

	wmb();
	do {
		ssleep(1);
		for_each_online_cpu(cpu) {
			if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu))
				cpu_clear(cpu, map);
		}
		cpus_and(map, map, cpu_online_map);
		/*
		 * We waited 1 sec; if a CPU still did not enter idle,
		 * it may be because it is already idle and has nothing
		 * to wake up for. Give all the remaining CPUs a kick.
		 */
		smp_call_function_mask(map, do_nothing, 0, 0);
	} while (!cpus_empty(map));

	set_cpus_allowed(current, tmp);
}
EXPORT_SYMBOL_GPL(cpu_idle_wait);
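
/*
 * Typical use (a sketch, not a fixed API contract): code that installs a
 * new idle handler calls cpu_idle_wait() afterwards, so that no CPU can
 * still be executing the old handler when it goes away, e.g.
 *
 *	pm_idle = my_idle;	(my_idle being some new handler)
 *	cpu_idle_wait();
 */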

/*
 * This uses the new MONITOR/MWAIT instructions on P4 processors with PNI,
 * which can obviate the IPI otherwise needed to trigger checking of
 * need_resched. We execute MONITOR against need_resched and enter an
 * optimized wait state through MWAIT. Whenever someone changes
 * need_resched, we would be woken up from MWAIT (without an IPI).
 *
 * New with Core Duo processors, MWAIT can take some hints based on CPU
 * capability.
 */
void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
{
	if (!need_resched()) {
		__monitor((void *)&current_thread_info()->flags, 0, 0);
		smp_mb();
		if (!need_resched())
			__mwait(ax, cx);
	}
}
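
/*
 * Background on the hint arguments (summarized from the Intel SDM, not
 * defined by this file): EAX bits [7:4] select the target C-state (0
 * means C1) and bits [3:0] a sub-state; ECX bit 0 asks that interrupts
 * be treated as break events even when they are masked. A caller
 * targeting C1 with interrupt break events would thus pass roughly (0, 1).
 */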

/* Default MONITOR/MWAIT with no hints, used for default C1 state */
static void mwait_idle(void)
{
	if (!need_resched()) {
		__monitor((void *)&current_thread_info()->flags, 0, 0);
		smp_mb();
		if (!need_resched())
			__sti_mwait(0, 0);
		else
			local_irq_enable();
	} else {
		local_irq_enable();
	}
}

static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c)
{
	if (force_mwait)
		return 1;
	/* Any C1 states supported? */
	return c->cpuid_level >= 5 && ((cpuid_edx(5) >> 4) & 0xf) > 0;
}

void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
{
	static int selected;

	if (selected)
		return;
#ifdef CONFIG_X86_SMP
	if (pm_idle == poll_idle && smp_num_siblings > 1) {
		printk(KERN_WARNING "WARNING: polling idle and HT enabled,"
			" performance may degrade.\n");
	}
#endif
	if (cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c)) {
		/*
		 * Skip if setup has overridden idle.
		 * If one CPU supports mwait, all CPUs support mwait.
		 */
		if (!pm_idle) {
			printk(KERN_INFO "using mwait in idle threads.\n");
			pm_idle = mwait_idle;
		}
	}
	selected = 1;
}

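/*
 * Boot-time selection: "idle=poll" switches to the polling loop above,
 * "idle=mwait" forces MWAIT even if the usability check fails; both set
 * boot_option_idle_override so later setup won't override the choice.
 */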
static int __init idle_setup(char *str)
{
	if (!strcmp(str, "poll")) {
		printk(KERN_INFO "using polling idle threads.\n");
		pm_idle = poll_idle;
	} else if (!strcmp(str, "mwait"))
		force_mwait = 1;
	else
		return -1;

	boot_option_idle_override = 1;
	return 0;
}
early_param("idle", idle_setup);

/* Prints also some state that isn't saved in the pt_regs */
void __show_regs(struct pt_regs *regs)
{
	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
	unsigned long d0, d1, d2, d3, d6, d7;
	unsigned int fsindex, gsindex;
	unsigned int ds, cs, es;

	printk("\n");
	print_modules();
	printk("Pid: %d, comm: %.20s %s %s %.*s\n",
		current->pid, current->comm, print_tainted(),
		init_utsname()->release,
		(int)strcspn(init_utsname()->version, " "),
		init_utsname()->version);
	printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
	printk_address(regs->ip, 1);
	printk("RSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss, regs->sp,
		regs->flags);
	printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
	       regs->ax, regs->bx, regs->cx);
	printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
	       regs->dx, regs->si, regs->di);
	printk("RBP: %016lx R08: %016lx R09: %016lx\n",
	       regs->bp, regs->r8, regs->r9);
	printk("R10: %016lx R11: %016lx R12: %016lx\n",
	       regs->r10, regs->r11, regs->r12);
	printk("R13: %016lx R14: %016lx R15: %016lx\n",
	       regs->r13, regs->r14, regs->r15);

	asm("movl %%ds,%0" : "=r" (ds));
	asm("movl %%cs,%0" : "=r" (cs));
	asm("movl %%es,%0" : "=r" (es));
	asm("movl %%fs,%0" : "=r" (fsindex));
	asm("movl %%gs,%0" : "=r" (gsindex));

	rdmsrl(MSR_FS_BASE, fs);
	rdmsrl(MSR_GS_BASE, gs);
	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

	cr0 = read_cr0();
	cr2 = read_cr2();
	cr3 = read_cr3();
	cr4 = read_cr4();

	printk("FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
	       fs, fsindex, gs, gsindex, shadowgs);
	printk("CS:  %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0);
	printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4);

	get_debugreg(d0, 0);
	get_debugreg(d1, 1);
	get_debugreg(d2, 2);
	printk("DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
	get_debugreg(d3, 3);
	get_debugreg(d6, 6);
	get_debugreg(d7, 7);
	printk("DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
}

void show_regs(struct pt_regs *regs)
{
	printk("CPU %d:", smp_processor_id());
	__show_regs(regs);
	show_trace(NULL, regs, (void *)(regs + 1), regs->bp);
}

/*
 * Free current thread data structures etc.
 */
void exit_thread(void)
{
	struct task_struct *me = current;
	struct thread_struct *t = &me->thread;

	if (me->thread.io_bitmap_ptr) {
		struct tss_struct *tss = &per_cpu(init_tss, get_cpu());

		kfree(t->io_bitmap_ptr);
		t->io_bitmap_ptr = NULL;
		clear_thread_flag(TIF_IO_BITMAP);
		/*
		 * Careful, clear this in the TSS too:
		 */
		memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
		t->io_bitmap_max = 0;
		put_cpu();
	}
}

void flush_thread(void)
{
	struct task_struct *tsk = current;

	if (test_tsk_thread_flag(tsk, TIF_ABI_PENDING)) {
		clear_tsk_thread_flag(tsk, TIF_ABI_PENDING);
		if (test_tsk_thread_flag(tsk, TIF_IA32)) {
			clear_tsk_thread_flag(tsk, TIF_IA32);
		} else {
			set_tsk_thread_flag(tsk, TIF_IA32);
			current_thread_info()->status |= TS_COMPAT;
		}
	}
	clear_tsk_thread_flag(tsk, TIF_DEBUG);

	tsk->thread.debugreg0 = 0;
	tsk->thread.debugreg1 = 0;
	tsk->thread.debugreg2 = 0;
	tsk->thread.debugreg3 = 0;
	tsk->thread.debugreg6 = 0;
	tsk->thread.debugreg7 = 0;
	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
	/*
	 * Forget coprocessor state.
	 */
	clear_fpu(tsk);
	clear_used_math();
}

void release_thread(struct task_struct *dead_task)
{
	if (dead_task->mm) {
		if (dead_task->mm->context.size) {
			printk(KERN_WARNING
				"WARNING: dead process %8s still has LDT? <%p/%d>\n",
				dead_task->comm,
				dead_task->mm->context.ldt,
				dead_task->mm->context.size);
			BUG();
		}
	}
}

static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
{
	struct user_desc ud = {
		.base_addr = addr,
		.limit = 0xfffff,
		.seg_32bit = 1,
		.limit_in_pages = 1,
		.useable = 1,
	};
	struct desc_struct *desc = t->thread.tls_array;
	desc += tls;
	fill_ldt(desc, &ud);
}

static inline u32 read_32bit_tls(struct task_struct *t, int tls)
{
	return get_desc_base(&t->thread.tls_array[tls]);
}

/*
 * This gets called before we allocate a new thread and copy
 * the current task into it.
 */
void prepare_to_copy(struct task_struct *tsk)
{
	unlazy_fpu(tsk);
}

int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
		unsigned long unused,
		struct task_struct *p, struct pt_regs *regs)
{
	int err;
	struct pt_regs *childregs;
	struct task_struct *me = current;

	childregs = ((struct pt_regs *)
			(THREAD_SIZE + task_stack_page(p))) - 1;
	*childregs = *regs;

	childregs->ax = 0;
	childregs->sp = sp;
	if (sp == ~0UL)
		childregs->sp = (unsigned long)childregs;

	p->thread.sp = (unsigned long) childregs;
	p->thread.sp0 = (unsigned long) (childregs+1);
	p->thread.usersp = me->thread.usersp;

	set_tsk_thread_flag(p, TIF_FORK);

	p->thread.fs = me->thread.fs;
	p->thread.gs = me->thread.gs;

	asm("mov %%gs,%0" : "=m" (p->thread.gsindex));
	asm("mov %%fs,%0" : "=m" (p->thread.fsindex));
	asm("mov %%es,%0" : "=m" (p->thread.es));
	asm("mov %%ds,%0" : "=m" (p->thread.ds));

	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
		p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
		if (!p->thread.io_bitmap_ptr) {
			p->thread.io_bitmap_max = 0;
			return -ENOMEM;
		}
		memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
				IO_BITMAP_BYTES);
		set_tsk_thread_flag(p, TIF_IO_BITMAP);
	}

	/*
	 * Set a new TLS for the child thread?
	 */
	if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
		if (test_thread_flag(TIF_IA32))
			err = do_set_thread_area(p, -1,
				(struct user_desc __user *)childregs->si, 0);
		else
#endif
			err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
		if (err)
			goto out;
	}
	err = 0;
out:
	if (err && p->thread.io_bitmap_ptr) {
		kfree(p->thread.io_bitmap_ptr);
		p->thread.io_bitmap_max = 0;
	}
	return err;
}
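
/*
 * Note: sp == ~0UL above is the sentinel kernel_thread() passes for
 * "no user stack", so kernel threads start with sp pointing at their
 * fresh kernel stack, just below childregs.
 */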

/*
 * This special macro can be used to load a debugging register
 */
#define loaddebug(thread, r) set_debugreg(thread->debugreg ## r, r)

static inline void __switch_to_xtra(struct task_struct *prev_p,
				    struct task_struct *next_p,
				    struct tss_struct *tss)
{
	struct thread_struct *prev, *next;
	unsigned long debugctl;

	prev = &prev_p->thread;
	next = &next_p->thread;

	debugctl = prev->debugctlmsr;
	if (next->ds_area_msr != prev->ds_area_msr) {
		/*
		 * We clear debugctl to make sure DS
		 * is not in use when we change it:
		 */
		debugctl = 0;
		wrmsrl(MSR_IA32_DEBUGCTLMSR, 0);
		wrmsrl(MSR_IA32_DS_AREA, next->ds_area_msr);
	}

	if (next->debugctlmsr != debugctl)
		wrmsrl(MSR_IA32_DEBUGCTLMSR, next->debugctlmsr);

	if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
		loaddebug(next, 0);
		loaddebug(next, 1);
		loaddebug(next, 2);
		loaddebug(next, 3);
		/* no 4 and 5 */
		loaddebug(next, 6);
		loaddebug(next, 7);
	}

	if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
		/*
		 * Copy the relevant range of the IO bitmap.
		 * Normally this is 128 bytes or less:
		 */
		memcpy(tss->io_bitmap, next->io_bitmap_ptr,
		       max(prev->io_bitmap_max, next->io_bitmap_max));
	} else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
		/*
		 * Clear any possible leftover bits:
		 */
		memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
	}

	if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
		ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);

	if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
		ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
}

/*
 *	switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes not supported here. Set the probe on schedule instead.
 */
struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
	struct thread_struct *prev = &prev_p->thread,
				 *next = &next_p->thread;
	int cpu = smp_processor_id();
	struct tss_struct *tss = &per_cpu(init_tss, cpu);

	/* we're going to use this soon, after a few expensive things */
	if (next_p->fpu_counter > 5)
		prefetch(&next->i387.fxsave);

	/*
	 * Reload esp0, LDT and the page table pointer:
	 */
	load_sp0(tss, next);

	/*
	 * Switch DS and ES.
	 * This won't pick up thread selector changes, but I guess that is ok.
	 */
	asm volatile("mov %%es,%0" : "=m" (prev->es));
	if (unlikely(next->es | prev->es))
		loadsegment(es, next->es);

	asm volatile("mov %%ds,%0" : "=m" (prev->ds));
	if (unlikely(next->ds | prev->ds))
		loadsegment(ds, next->ds);

	load_TLS(next, cpu);

	/*
	 * Switch FS and GS.
	 */
	{
		unsigned fsindex;
		asm volatile("movl %%fs,%0" : "=r" (fsindex));
		/*
		 * Segment register != 0 always requires a reload.
		 * Also reload when it has changed.
		 * When the prev process used a 64bit base always reload
		 * to avoid an information leak.
		 */
		if (unlikely(fsindex | next->fsindex | prev->fs)) {
			loadsegment(fs, next->fsindex);
			/*
			 * Check if the user used a selector != 0; if yes
			 * clear the 64bit base, since the overloaded base
			 * is always mapped to the NULL selector.
			 */
			if (fsindex)
				prev->fs = 0;
		}
		/* when next process has a 64bit base use it */
		if (next->fs)
			wrmsrl(MSR_FS_BASE, next->fs);
		prev->fsindex = fsindex;
	}
	{
		unsigned gsindex;
		asm volatile("movl %%gs,%0" : "=r" (gsindex));
		if (unlikely(gsindex | next->gsindex | prev->gs)) {
			load_gs_index(next->gsindex);
			if (gsindex)
				prev->gs = 0;
		}
		if (next->gs)
			wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
		prev->gsindex = gsindex;
	}

	/* Must be after DS reload */
	unlazy_fpu(prev_p);

	/*
	 * Switch the PDA and FPU contexts.
	 */
	prev->usersp = read_pda(oldrsp);
	write_pda(oldrsp, next->usersp);
	write_pda(pcurrent, next_p);

	write_pda(kernelstack,
	(unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
#ifdef CONFIG_CC_STACKPROTECTOR
	write_pda(stack_canary, next_p->stack_canary);
	/*
	 * Build time only check to make sure the stack_canary is at
	 * offset 40 in the pda; this is a gcc ABI requirement
	 */
	BUILD_BUG_ON(offsetof(struct x8664_pda, stack_canary) != 40);
#endif

	/*
	 * Now maybe reload the debug registers and handle I/O bitmaps
	 */
	if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
		     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
		__switch_to_xtra(prev_p, next_p, tss);

	/*
	 * If the task has used the FPU in the last 5 timeslices, just do a
	 * full restore of the math state immediately to avoid the trap; the
	 * chances of needing the FPU soon are obviously high now.
	 */
	if (next_p->fpu_counter > 5)
		math_state_restore();
	return prev_p;
}
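
/*
 * (__switch_to() returns the previous task so the switch_to() wrapper can
 * hand it back to the generic scheduler code as "last".)
 */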

/*
 * sys_execve() executes a new program.
 */
asmlinkage
long sys_execve(char __user *name, char __user * __user *argv,
		char __user * __user *envp, struct pt_regs regs)
{
	long error;
	char *filename;

	filename = getname(name);
	error = PTR_ERR(filename);
	if (IS_ERR(filename))
		return error;
	error = do_execve(filename, argv, envp, &regs);
	putname(filename);
	return error;
}

void set_personality_64bit(void)
{
	/* inherit personality from parent */

	/* Make sure to be in 64bit mode */
	clear_thread_flag(TIF_IA32);

	/*
	 * TBD: this overwrites the user's setup. Should have two bits.
	 * But 64bit processes have always behaved this way, so it's not
	 * too bad. The main problem is that 32-bit children are affected
	 * again.
	 */
	current->personality &= ~READ_IMPLIES_EXEC;
}

asmlinkage long sys_fork(struct pt_regs *regs)
{
	return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL);
}

asmlinkage long
sys_clone(unsigned long clone_flags, unsigned long newsp,
	  void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
{
	if (!newsp)
		newsp = regs->sp;
	return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
}

/*
 * This is trivial, and on the face of it looks like it
 * could equally well be done in user mode.
 *
 * Not so, for quite unobvious reasons - register pressure.
 * In user mode vfork() cannot have a stack frame, and if
 * done by calling the "clone()" system call directly, you
 * do not have enough call-clobbered registers to hold all
 * the information you need.
 */
asmlinkage long sys_vfork(struct pt_regs *regs)
{
	return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0,
		    NULL, NULL);
}

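/*
 * Walk the frame-pointer chain of a sleeping task to find the address it
 * is blocked at. This relies on frames being linked through rbp (the
 * saved rbp sits at thread.sp, the return address 8 bytes above it), so
 * it only yields useful results when frame pointers are enabled.
 */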
unsigned long get_wchan(struct task_struct *p)
{
	unsigned long stack;
	u64 fp, ip;
	int count = 0;

	if (!p || p == current || p->state == TASK_RUNNING)
		return 0;
	stack = (unsigned long)task_stack_page(p);
	if (p->thread.sp < stack || p->thread.sp > stack + THREAD_SIZE)
		return 0;
	fp = *(u64 *)(p->thread.sp);
	do {
		if (fp < (unsigned long)stack ||
		    fp > (unsigned long)stack + THREAD_SIZE)
			return 0;
		ip = *(u64 *)(fp + 8);
		if (!in_sched_functions(ip))
			return ip;
		fp = *(u64 *)fp;
	} while (count++ < 16);
	return 0;
}

long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
	int ret = 0;
	int doit = task == current;
	int cpu;

	switch (code) {
	case ARCH_SET_GS:
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/*
		 * Handle small bases via the GDT because that's faster to
		 * switch:
		 */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, GS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				load_gs_index(GS_TLS_SEL);
			}
			task->thread.gsindex = GS_TLS_SEL;
			task->thread.gs = 0;
		} else {
			task->thread.gsindex = 0;
			task->thread.gs = addr;
			if (doit) {
				load_gs_index(0);
				ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_SET_FS:
		/*
		 * Not strictly needed for fs, but do it for symmetry
		 * with gs:
		 */
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/*
		 * Handle small bases via the GDT because that's faster to
		 * switch:
		 */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, FS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL));
			}
			task->thread.fsindex = FS_TLS_SEL;
			task->thread.fs = 0;
		} else {
			task->thread.fsindex = 0;
			task->thread.fs = addr;
			if (doit) {
				/*
				 * Set the selector to 0 to not confuse
				 * __switch_to:
				 */
				asm volatile("movl %0,%%fs" :: "r" (0));
				ret = checking_wrmsrl(MSR_FS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_GET_FS: {
		unsigned long base;
		if (task->thread.fsindex == FS_TLS_SEL)
			base = read_32bit_tls(task, FS_TLS);
		else if (doit)
			rdmsrl(MSR_FS_BASE, base);
		else
			base = task->thread.fs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}
	case ARCH_GET_GS: {
		unsigned long base;
		unsigned gsindex;
		if (task->thread.gsindex == GS_TLS_SEL)
			base = read_32bit_tls(task, GS_TLS);
		else if (doit) {
			asm("movl %%gs,%0" : "=r" (gsindex));
			if (gsindex)
				rdmsrl(MSR_KERNEL_GS_BASE, base);
			else
				base = task->thread.gs;
		} else
			base = task->thread.gs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}

	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}

long sys_arch_prctl(int code, unsigned long addr)
{
	return do_arch_prctl(current, code, addr);
}
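
/*
 * Userspace view, as an illustrative sketch only: a 64-bit threading
 * library typically points fs at its TLS block with
 *
 *	syscall(SYS_arch_prctl, ARCH_SET_FS, (unsigned long)tls_block);
 *
 * where tls_block is whatever per-thread area the library allocated.
 */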

unsigned long arch_align_stack(unsigned long sp)
{
	if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
		sp -= get_random_int() % 8192;
	return sp & ~0xf;
}
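
/*
 * Worked example for the above: with sp = 0x7fffffffe000 and a random
 * draw of 1234, sp becomes 0x7fffffffdb2e, and masking with ~0xf yields
 * 0x7fffffffdb20 - up to 8191 bytes of jitter, re-aligned to the 16-byte
 * boundary the x86-64 ABI expects.
 */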

unsigned long arch_randomize_brk(struct mm_struct *mm)
{
	unsigned long range_end = mm->brk + 0x02000000;
	return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
}
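
/*
 * Note on the constant above: 0x02000000 is 32 MiB, so the heap start is
 * randomized somewhere in [brk, brk + 32 MiB); if randomize_range() fails
 * and returns 0, the unrandomized brk is used as a fallback.
 */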
929