xref: /linux/arch/x86/kernel/process_64.c (revision a1e58bbdc969c3fe60addca7f2729779d22a83c1)
1 /*
2  *  Copyright (C) 1995  Linus Torvalds
3  *
4  *  Pentium III FXSR, SSE support
5  *	Gareth Hughes <gareth@valinux.com>, May 2000
6  *
7  *  X86-64 port
8  *	Andi Kleen.
9  *
10  *	CPU hotplug support - ashok.raj@intel.com
11  */
12 
13 /*
14  * This file handles the architecture-dependent parts of process handling..
15  */
16 
17 #include <stdarg.h>
18 
19 #include <linux/cpu.h>
20 #include <linux/errno.h>
21 #include <linux/sched.h>
22 #include <linux/fs.h>
23 #include <linux/kernel.h>
24 #include <linux/mm.h>
25 #include <linux/elfcore.h>
26 #include <linux/smp.h>
27 #include <linux/slab.h>
28 #include <linux/user.h>
29 #include <linux/interrupt.h>
30 #include <linux/utsname.h>
31 #include <linux/delay.h>
32 #include <linux/module.h>
33 #include <linux/ptrace.h>
34 #include <linux/random.h>
35 #include <linux/notifier.h>
36 #include <linux/kprobes.h>
37 #include <linux/kdebug.h>
38 #include <linux/tick.h>
39 
40 #include <asm/uaccess.h>
41 #include <asm/pgtable.h>
42 #include <asm/system.h>
43 #include <asm/io.h>
44 #include <asm/processor.h>
45 #include <asm/i387.h>
46 #include <asm/mmu_context.h>
47 #include <asm/pda.h>
48 #include <asm/prctl.h>
49 #include <asm/desc.h>
50 #include <asm/proto.h>
51 #include <asm/ia32.h>
52 #include <asm/idle.h>
53 
54 asmlinkage extern void ret_from_fork(void);
55 
56 unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;
57 
58 unsigned long boot_option_idle_override = 0;
59 EXPORT_SYMBOL(boot_option_idle_override);
60 
61 /*
62  * Power management idle function, if any.
63  */
64 void (*pm_idle)(void);
65 EXPORT_SYMBOL(pm_idle);
66 static DEFINE_PER_CPU(unsigned int, cpu_idle_state);
67 
68 static ATOMIC_NOTIFIER_HEAD(idle_notifier);
69 
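/*
 * Register a callback on the idle notifier chain; it is invoked with
 * IDLE_START when a CPU enters idle and IDLE_END when it leaves.
 */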
70 void idle_notifier_register(struct notifier_block *n)
71 {
72 	atomic_notifier_chain_register(&idle_notifier, n);
73 }
74 
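/* Mark this CPU as idle in the PDA and notify IDLE_START listeners. */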
75 void enter_idle(void)
76 {
77 	write_pda(isidle, 1);
78 	atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
79 }
80 
81 static void __exit_idle(void)
82 {
83 	if (test_and_clear_bit_pda(0, isidle) == 0)
84 		return;
85 	atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
86 }
87 
88 /* Called from interrupts to signify idle end */
89 void exit_idle(void)
90 {
91 	/* idle loop has pid 0 */
92 	if (current->pid)
93 		return;
94 	__exit_idle();
95 }
96 
97 /*
98  * We use this if we don't have any better
99  * idle routine..
100  */
101 void default_idle(void)
102 {
103 	current_thread_info()->status &= ~TS_POLLING;
104 	/*
105 	 * TS_POLLING-cleared state must be visible before we
106 	 * test NEED_RESCHED:
107 	 */
108 	smp_mb();
109 	local_irq_disable();
110 	if (!need_resched()) {
111 		ktime_t t0, t1;
112 		u64 t0n, t1n;
113 
114 		t0 = ktime_get();
115 		t0n = ktime_to_ns(t0);
116 		safe_halt();	/* enables interrupts racelessly */
117 		local_irq_disable();
118 		t1 = ktime_get();
119 		t1n = ktime_to_ns(t1);
120 		sched_clock_idle_wakeup_event(t1n - t0n);
121 	}
122 	local_irq_enable();
123 	current_thread_info()->status |= TS_POLLING;
124 }
125 
126 /*
127  * On SMP it's slightly faster (but much more power-consuming!)
128  * to poll the ->need_resched flag instead of waiting for the
129  * cross-CPU IPI to arrive. Use this option with caution.
130  */
131 static void poll_idle(void)
132 {
133 	local_irq_enable();
134 	cpu_relax();
135 }
136 
137 #ifdef CONFIG_HOTPLUG_CPU
138 DECLARE_PER_CPU(int, cpu_state);
139 
140 #include <asm/nmi.h>
141 /* With physical CPU hotplug, we simply halt the CPU */
142 static inline void play_dead(void)
143 {
144 	idle_task_exit();
145 	wbinvd();
146 	mb();
147 	/* Ack it */
148 	__get_cpu_var(cpu_state) = CPU_DEAD;
149 
150 	local_irq_disable();
151 	while (1)
152 		halt();
153 }
154 #else
155 static inline void play_dead(void)
156 {
157 	BUG();
158 }
159 #endif /* CONFIG_HOTPLUG_CPU */
160 
161 /*
162  * The idle thread. There's no useful work to be
163  * done, so just try to conserve power and have a
164  * low exit latency (i.e. sit in a loop waiting for
165  * somebody to say that they'd like to reschedule)
166  */
167 void cpu_idle(void)
168 {
169 	current_thread_info()->status |= TS_POLLING;
170 	/* endless idle loop with no priority at all */
171 	while (1) {
172 		tick_nohz_stop_sched_tick();
173 		while (!need_resched()) {
174 			void (*idle)(void);
175 
176 			if (__get_cpu_var(cpu_idle_state))
177 				__get_cpu_var(cpu_idle_state) = 0;
178 
179 			rmb();
180 			idle = pm_idle;
181 			if (!idle)
182 				idle = default_idle;
183 			if (cpu_is_offline(smp_processor_id()))
184 				play_dead();
185 			/*
186 			 * Idle routines should keep interrupts disabled
187 			 * from here on, until they go to idle.
188 			 * Otherwise, idle callbacks can misfire.
189 			 */
190 			local_irq_disable();
191 			enter_idle();
192 			idle();
193 			/* In many cases the interrupt that ended idle
194 			   has already called exit_idle. But some idle
195 			   loops can be woken up without an interrupt. */
196 			__exit_idle();
197 		}
198 
199 		tick_nohz_restart_sched_tick();
200 		preempt_enable_no_resched();
201 		schedule();
202 		preempt_disable();
203 	}
204 }
205 
206 static void do_nothing(void *unused)
207 {
208 }
209 
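/*
 * Wait until every online CPU has passed through its idle routine at
 * least once.  Callers that change pm_idle use this to make sure no
 * CPU is still executing the old idle handler.
 */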
210 void cpu_idle_wait(void)
211 {
212 	unsigned int cpu, this_cpu = get_cpu();
213 	cpumask_t map, tmp = current->cpus_allowed;
214 
215 	set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
216 	put_cpu();
217 
218 	cpus_clear(map);
219 	for_each_online_cpu(cpu) {
220 		per_cpu(cpu_idle_state, cpu) = 1;
221 		cpu_set(cpu, map);
222 	}
223 
224 	__get_cpu_var(cpu_idle_state) = 0;
225 
226 	wmb();
227 	do {
228 		ssleep(1);
229 		for_each_online_cpu(cpu) {
230 			if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu))
231 				cpu_clear(cpu, map);
232 		}
233 		cpus_and(map, map, cpu_online_map);
234 		/*
235 		 * We waited 1 sec; if a CPU still has not entered the idle
236 		 * loop, it may be because it is already idle and not waking
237 		 * up because it has nothing to do.
238 		 * Give all the remaining CPUs a kick.
239 		 */
240 		smp_call_function_mask(map, do_nothing, 0, 0);
241 	} while (!cpus_empty(map));
242 
243 	set_cpus_allowed(current, tmp);
244 }
245 EXPORT_SYMBOL_GPL(cpu_idle_wait);
246 
247 /*
248  * This uses the new MONITOR/MWAIT instructions on P4 processors with PNI,
249  * which can obviate the IPI used to trigger checking of need_resched.
250  * We execute MONITOR against need_resched and enter an optimized wait state
251  * through MWAIT. Whenever someone changes need_resched, we are woken up
252  * from MWAIT (without an IPI).
253  *
254  * Starting with Core Duo processors, MWAIT can take hints based on CPU
255  * capability.
256  */
257 void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
258 {
259 	if (!need_resched()) {
260 		__monitor((void *)&current_thread_info()->flags, 0, 0);
261 		smp_mb();
262 		if (!need_resched())
263 			__mwait(ax, cx);
264 	}
265 }
266 
267 /* Default MONITOR/MWAIT with no hints, used for default C1 state */
268 static void mwait_idle(void)
269 {
270 	if (!need_resched()) {
271 		__monitor((void *)&current_thread_info()->flags, 0, 0);
272 		smp_mb();
273 		if (!need_resched())
274 			__sti_mwait(0, 0);
275 		else
276 			local_irq_enable();
277 	} else {
278 		local_irq_enable();
279 	}
280 }
281 
282 
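/*
 * MWAIT is usable for idle if it was forced on the command line or if
 * CPUID leaf 5 advertises at least one C1 sub-state.
 */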
283 static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c)
284 {
285 	if (force_mwait)
286 		return 1;
287 	/* Any C1 states supported? */
288 	return c->cpuid_level >= 5 && ((cpuid_edx(5) >> 4) & 0xf) > 0;
289 }
290 
291 void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
292 {
293 	static int selected;
294 
295 	if (selected)
296 		return;
297 #ifdef CONFIG_X86_SMP
298 	if (pm_idle == poll_idle && smp_num_siblings > 1) {
299 		printk(KERN_WARNING "WARNING: polling idle and HT enabled,"
300 			" performance may degrade.\n");
301 	}
302 #endif
303 	if (cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c)) {
304 		/*
305 		 * Skip if setup has overridden idle.
306 		 * One CPU supporting mwait => all CPUs support mwait.
307 		 */
308 		if (!pm_idle) {
309 			printk(KERN_INFO "using mwait in idle threads.\n");
310 			pm_idle = mwait_idle;
311 		}
312 	}
313 	selected = 1;
314 }
315 
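/*
 * Parse the "idle=" boot parameter: "idle=poll" selects the polling
 * idle loop, "idle=mwait" forces MWAIT even when mwait_usable() would
 * not pick it.
 */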
316 static int __init idle_setup(char *str)
317 {
318 	if (!strcmp(str, "poll")) {
319 		printk(KERN_INFO "using polling idle threads.\n");
320 		pm_idle = poll_idle;
321 	} else if (!strcmp(str, "mwait"))
322 		force_mwait = 1;
323 	else
324 		return -1;
325 
326 	boot_option_idle_override = 1;
327 	return 0;
328 }
329 early_param("idle", idle_setup);
330 
331 /* Also prints some state that isn't saved in the pt_regs */
332 void __show_regs(struct pt_regs * regs)
333 {
334 	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
335 	unsigned long d0, d1, d2, d3, d6, d7;
336 	unsigned int fsindex, gsindex;
337 	unsigned int ds, cs, es;
338 
339 	printk("\n");
340 	print_modules();
341 	printk("Pid: %d, comm: %.20s %s %s %.*s\n",
342 		current->pid, current->comm, print_tainted(),
343 		init_utsname()->release,
344 		(int)strcspn(init_utsname()->version, " "),
345 		init_utsname()->version);
346 	printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
347 	printk_address(regs->ip, 1);
348 	printk("RSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss, regs->sp,
349 		regs->flags);
350 	printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
351 	       regs->ax, regs->bx, regs->cx);
352 	printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
353 	       regs->dx, regs->si, regs->di);
354 	printk("RBP: %016lx R08: %016lx R09: %016lx\n",
355 	       regs->bp, regs->r8, regs->r9);
356 	printk("R10: %016lx R11: %016lx R12: %016lx\n",
357 	       regs->r10, regs->r11, regs->r12);
358 	printk("R13: %016lx R14: %016lx R15: %016lx\n",
359 	       regs->r13, regs->r14, regs->r15);
360 
361 	asm("movl %%ds,%0" : "=r" (ds));
362 	asm("movl %%cs,%0" : "=r" (cs));
363 	asm("movl %%es,%0" : "=r" (es));
364 	asm("movl %%fs,%0" : "=r" (fsindex));
365 	asm("movl %%gs,%0" : "=r" (gsindex));
366 
367 	rdmsrl(MSR_FS_BASE, fs);
368 	rdmsrl(MSR_GS_BASE, gs);
369 	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
370 
371 	cr0 = read_cr0();
372 	cr2 = read_cr2();
373 	cr3 = read_cr3();
374 	cr4 = read_cr4();
375 
376 	printk("FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
377 	       fs, fsindex, gs, gsindex, shadowgs);
378 	printk("CS:  %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0);
379 	printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4);
380 
381 	get_debugreg(d0, 0);
382 	get_debugreg(d1, 1);
383 	get_debugreg(d2, 2);
384 	printk("DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
385 	get_debugreg(d3, 3);
386 	get_debugreg(d6, 6);
387 	get_debugreg(d7, 7);
388 	printk("DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
389 }
390 
391 void show_regs(struct pt_regs *regs)
392 {
393 	printk("CPU %d:", smp_processor_id());
394 	__show_regs(regs);
395 	show_trace(NULL, regs, (void *)(regs + 1), regs->bp);
396 }
397 
398 /*
399  * Free current thread data structures etc..
400  */
401 void exit_thread(void)
402 {
403 	struct task_struct *me = current;
404 	struct thread_struct *t = &me->thread;
405 
406 	if (me->thread.io_bitmap_ptr) {
407 		struct tss_struct *tss = &per_cpu(init_tss, get_cpu());
408 
409 		kfree(t->io_bitmap_ptr);
410 		t->io_bitmap_ptr = NULL;
411 		clear_thread_flag(TIF_IO_BITMAP);
412 		/*
413 		 * Careful, clear this in the TSS too:
414 		 */
415 		memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
416 		t->io_bitmap_max = 0;
417 		put_cpu();
418 	}
419 }
420 
421 void flush_thread(void)
422 {
423 	struct task_struct *tsk = current;
424 
425 	if (test_tsk_thread_flag(tsk, TIF_ABI_PENDING)) {
426 		clear_tsk_thread_flag(tsk, TIF_ABI_PENDING);
427 		if (test_tsk_thread_flag(tsk, TIF_IA32)) {
428 			clear_tsk_thread_flag(tsk, TIF_IA32);
429 		} else {
430 			set_tsk_thread_flag(tsk, TIF_IA32);
431 			current_thread_info()->status |= TS_COMPAT;
432 		}
433 	}
434 	clear_tsk_thread_flag(tsk, TIF_DEBUG);
435 
436 	tsk->thread.debugreg0 = 0;
437 	tsk->thread.debugreg1 = 0;
438 	tsk->thread.debugreg2 = 0;
439 	tsk->thread.debugreg3 = 0;
440 	tsk->thread.debugreg6 = 0;
441 	tsk->thread.debugreg7 = 0;
442 	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
443 	/*
444 	 * Forget coprocessor state..
445 	 */
446 	clear_fpu(tsk);
447 	clear_used_math();
448 }
449 
450 void release_thread(struct task_struct *dead_task)
451 {
452 	if (dead_task->mm) {
453 		if (dead_task->mm->context.size) {
454 			printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
455 					dead_task->comm,
456 					dead_task->mm->context.ldt,
457 					dead_task->mm->context.size);
458 			BUG();
459 		}
460 	}
461 }
462 
463 static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
464 {
465 	struct user_desc ud = {
466 		.base_addr = addr,
467 		.limit = 0xfffff,
468 		.seg_32bit = 1,
469 		.limit_in_pages = 1,
470 		.useable = 1,
471 	};
472 	struct desc_struct *desc = t->thread.tls_array;
473 	desc += tls;
474 	fill_ldt(desc, &ud);
475 }
476 
477 static inline u32 read_32bit_tls(struct task_struct *t, int tls)
478 {
479 	return get_desc_base(&t->thread.tls_array[tls]);
480 }
481 
482 /*
483  * This gets called before we allocate a new thread and copy
484  * the current task into it.
485  */
486 void prepare_to_copy(struct task_struct *tsk)
487 {
488 	unlazy_fpu(tsk);
489 }
490 
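/*
 * Set up the kernel stack and thread state of a newly forked task: the
 * child gets a copy of the parent's pt_regs with ax cleared (so fork()
 * returns 0 in the child), plus copies of the segment registers, the
 * I/O bitmap and, with CLONE_SETTLS, its own TLS.
 */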
491 int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
492 		unsigned long unused,
493 	struct task_struct * p, struct pt_regs * regs)
494 {
495 	int err;
496 	struct pt_regs * childregs;
497 	struct task_struct *me = current;
498 
499 	childregs = ((struct pt_regs *)
500 			(THREAD_SIZE + task_stack_page(p))) - 1;
501 	*childregs = *regs;
502 
503 	childregs->ax = 0;
504 	childregs->sp = sp;
505 	if (sp == ~0UL)
506 		childregs->sp = (unsigned long)childregs;
507 
508 	p->thread.sp = (unsigned long) childregs;
509 	p->thread.sp0 = (unsigned long) (childregs+1);
510 	p->thread.usersp = me->thread.usersp;
511 
512 	set_tsk_thread_flag(p, TIF_FORK);
513 
514 	p->thread.fs = me->thread.fs;
515 	p->thread.gs = me->thread.gs;
516 
517 	asm("mov %%gs,%0" : "=m" (p->thread.gsindex));
518 	asm("mov %%fs,%0" : "=m" (p->thread.fsindex));
519 	asm("mov %%es,%0" : "=m" (p->thread.es));
520 	asm("mov %%ds,%0" : "=m" (p->thread.ds));
521 
522 	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
523 		p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
524 		if (!p->thread.io_bitmap_ptr) {
525 			p->thread.io_bitmap_max = 0;
526 			return -ENOMEM;
527 		}
528 		memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
529 				IO_BITMAP_BYTES);
530 		set_tsk_thread_flag(p, TIF_IO_BITMAP);
531 	}
532 
533 	/*
534 	 * Set a new TLS for the child thread?
535 	 */
536 	if (clone_flags & CLONE_SETTLS) {
537 #ifdef CONFIG_IA32_EMULATION
538 		if (test_thread_flag(TIF_IA32))
539 			err = do_set_thread_area(p, -1,
540 				(struct user_desc __user *)childregs->si, 0);
541 		else
542 #endif
543 			err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
544 		if (err)
545 			goto out;
546 	}
547 	err = 0;
548 out:
549 	if (err && p->thread.io_bitmap_ptr) {
550 		kfree(p->thread.io_bitmap_ptr);
551 		p->thread.io_bitmap_max = 0;
552 	}
553 	return err;
554 }
555 
556 /*
557  * This special macro can be used to load a debugging register
558  */
559 #define loaddebug(thread, r) set_debugreg(thread->debugreg ## r, r)
560 
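/*
 * Slow path of the context switch, only taken when either task has the
 * corresponding TIF flags set: DS area and debugctl MSRs, hardware
 * debug registers, the I/O permission bitmap and BTS timestamps.
 */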
561 static inline void __switch_to_xtra(struct task_struct *prev_p,
562 				    struct task_struct *next_p,
563 				    struct tss_struct *tss)
564 {
565 	struct thread_struct *prev, *next;
566 	unsigned long debugctl;
567 
568 	prev = &prev_p->thread,
569 	next = &next_p->thread;
570 
571 	debugctl = prev->debugctlmsr;
572 	if (next->ds_area_msr != prev->ds_area_msr) {
573 		/* we clear debugctl to make sure DS
574 		 * is not in use when we change it */
575 		debugctl = 0;
576 		wrmsrl(MSR_IA32_DEBUGCTLMSR, 0);
577 		wrmsrl(MSR_IA32_DS_AREA, next->ds_area_msr);
578 	}
579 
580 	if (next->debugctlmsr != debugctl)
581 		wrmsrl(MSR_IA32_DEBUGCTLMSR, next->debugctlmsr);
582 
583 	if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
584 		loaddebug(next, 0);
585 		loaddebug(next, 1);
586 		loaddebug(next, 2);
587 		loaddebug(next, 3);
588 		/* no 4 and 5 */
589 		loaddebug(next, 6);
590 		loaddebug(next, 7);
591 	}
592 
593 	if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
594 		/*
595 		 * Copy the relevant range of the IO bitmap.
596 		 * Normally this is 128 bytes or less:
597 		 */
598 		memcpy(tss->io_bitmap, next->io_bitmap_ptr,
599 		       max(prev->io_bitmap_max, next->io_bitmap_max));
600 	} else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
601 		/*
602 		 * Clear any possible leftover bits:
603 		 */
604 		memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
605 	}
606 
607 #ifdef X86_BTS
608 	if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
609 		ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);
610 
611 	if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
612 		ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
613 #endif
614 }
615 
616 /*
617  *	switch_to(x,y) should switch tasks from x to y.
618  *
619  * This could still be optimized:
620  * - fold all the options into a flag word and test it with a single test.
621  * - could test fs/gs bitsliced
622  *
623  * Kprobes are not supported here. Set the probe on schedule() instead.
624  */
625 struct task_struct *
626 __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
627 {
628 	struct thread_struct *prev = &prev_p->thread,
629 				 *next = &next_p->thread;
630 	int cpu = smp_processor_id();
631 	struct tss_struct *tss = &per_cpu(init_tss, cpu);
632 
633 	/* we're going to use this soon, after a few expensive things */
634 	if (next_p->fpu_counter > 5)
635 		prefetch(&next->i387.fxsave);
636 
637 	/*
638 	 * Reload esp0, LDT and the page table pointer:
639 	 */
640 	load_sp0(tss, next);
641 
642 	/*
643 	 * Switch DS and ES.
644 	 * This won't pick up thread selector changes, but I guess that is ok.
645 	 */
646 	asm volatile("mov %%es,%0" : "=m" (prev->es));
647 	if (unlikely(next->es | prev->es))
648 		loadsegment(es, next->es);
649 
650 	asm volatile ("mov %%ds,%0" : "=m" (prev->ds));
651 	if (unlikely(next->ds | prev->ds))
652 		loadsegment(ds, next->ds);
653 
654 	load_TLS(next, cpu);
655 
656 	/*
657 	 * Switch FS and GS.
658 	 */
659 	{
660 		unsigned fsindex;
661 		asm volatile("movl %%fs,%0" : "=r" (fsindex));
662 		/* A segment register != 0 always requires a reload;
663 		   also reload when it has changed.
664 		   When the previous process used a 64-bit base, always
665 		   reload to avoid an information leak. */
666 		if (unlikely(fsindex | next->fsindex | prev->fs)) {
667 			loadsegment(fs, next->fsindex);
668 			/* Check if the user used a selector != 0; if yes,
669 			 * clear the 64-bit base, since the overloaded base
670 			 * is always mapped to the NULL selector.
671 			 */
672 			if (fsindex)
673 				prev->fs = 0;
674 		}
675 		/* when next process has a 64bit base use it */
676 		if (next->fs)
677 			wrmsrl(MSR_FS_BASE, next->fs);
678 		prev->fsindex = fsindex;
679 	}
680 	{
681 		unsigned gsindex;
682 		asm volatile("movl %%gs,%0" : "=r" (gsindex));
683 		if (unlikely(gsindex | next->gsindex | prev->gs)) {
684 			load_gs_index(next->gsindex);
685 			if (gsindex)
686 				prev->gs = 0;
687 		}
688 		if (next->gs)
689 			wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
690 		prev->gsindex = gsindex;
691 	}
692 
693 	/* Must be after DS reload */
694 	unlazy_fpu(prev_p);
695 
696 	/*
697 	 * Switch the PDA and FPU contexts.
698 	 */
699 	prev->usersp = read_pda(oldrsp);
700 	write_pda(oldrsp, next->usersp);
701 	write_pda(pcurrent, next_p);
702 
703 	write_pda(kernelstack,
704 	(unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
705 #ifdef CONFIG_CC_STACKPROTECTOR
706 	write_pda(stack_canary, next_p->stack_canary);
707 	/*
708 	 * Build time only check to make sure the stack_canary is at
709 	 * offset 40 in the pda; this is a gcc ABI requirement
710 	 */
711 	BUILD_BUG_ON(offsetof(struct x8664_pda, stack_canary) != 40);
712 #endif
713 
714 	/*
715 	 * Now maybe reload the debug registers and handle I/O bitmaps
716 	 */
717 	if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
718 		     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
719 		__switch_to_xtra(prev_p, next_p, tss);
720 
721 	/* If the task has used the FPU in the last 5 timeslices, just do a
722 	 * full restore of the math state immediately to avoid the trap; the
723 	 * chances of needing the FPU soon are obviously high now.
724 	 */
725 	if (next_p->fpu_counter > 5)
726 		math_state_restore();
727 	return prev_p;
728 }
729 
730 /*
731  * sys_execve() executes a new program.
732  */
733 asmlinkage
734 long sys_execve(char __user *name, char __user * __user *argv,
735 		char __user * __user *envp, struct pt_regs *regs)
736 {
737 	long error;
738 	char * filename;
739 
740 	filename = getname(name);
741 	error = PTR_ERR(filename);
742 	if (IS_ERR(filename))
743 		return error;
744 	error = do_execve(filename, argv, envp, regs);
745 	putname(filename);
746 	return error;
747 }
748 
749 void set_personality_64bit(void)
750 {
751 	/* inherit personality from parent */
752 
753 	/* Make sure to be in 64bit mode */
754 	clear_thread_flag(TIF_IA32);
755 
756 	/* TBD: overwrites user setup. Should have two bits.
757 	   But 64bit processes have always behaved this way,
758 	   so it's not too bad. The main problem is just that
759 	   32bit children are affected again. */
760 	current->personality &= ~READ_IMPLIES_EXEC;
761 }
762 
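/*
 * fork() and clone() entry points.  The child starts from a copy of the
 * parent's pt_regs; clone() may supply a new stack pointer.
 */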
763 asmlinkage long sys_fork(struct pt_regs *regs)
764 {
765 	return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL);
766 }
767 
768 asmlinkage long
769 sys_clone(unsigned long clone_flags, unsigned long newsp,
770 	  void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
771 {
772 	if (!newsp)
773 		newsp = regs->sp;
774 	return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
775 }
776 
777 /*
778  * This is trivial, and on the face of it looks like it
779  * could equally well be done in user mode.
780  *
781  * Not so, for quite unobvious reasons - register pressure.
782  * In user mode vfork() cannot have a stack frame, and if
783  * done by calling the "clone()" system call directly, you
784  * do not have enough call-clobbered registers to hold all
785  * the information you need.
786  */
787 asmlinkage long sys_vfork(struct pt_regs *regs)
788 {
789 	return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0,
790 		    NULL, NULL);
791 }
792 
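/*
 * Find out where a sleeping task is blocked by walking its saved frame
 * pointers until a return address outside the scheduler is found
 * (bounded to 16 frames).
 */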
793 unsigned long get_wchan(struct task_struct *p)
794 {
795 	unsigned long stack;
796 	u64 fp,ip;
797 	int count = 0;
798 
799 	if (!p || p == current || p->state==TASK_RUNNING)
800 		return 0;
801 	stack = (unsigned long)task_stack_page(p);
802 	if (p->thread.sp < stack || p->thread.sp > stack+THREAD_SIZE)
803 		return 0;
804 	fp = *(u64 *)(p->thread.sp);
805 	do {
806 		if (fp < (unsigned long)stack ||
807 		    fp > (unsigned long)stack+THREAD_SIZE)
808 			return 0;
809 		ip = *(u64 *)(fp+8);
810 		if (!in_sched_functions(ip))
811 			return ip;
812 		fp = *(u64 *)fp;
813 	} while (count++ < 16);
814 	return 0;
815 }
816 
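/*
 * Get or set the FS/GS base of a 64-bit task.  Small bases (<4GB) are
 * handled through a GDT TLS entry because reloading the selector is
 * cheaper; larger bases go through the FS/GS base MSRs.
 *
 * Illustrative only (user-space side, not part of this file): a process
 * normally reaches this through the arch_prctl(2) system call, e.g.
 *
 *	unsigned long base;
 *	syscall(SYS_arch_prctl, ARCH_GET_FS, &base);
 */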
817 long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
818 {
819 	int ret = 0;
820 	int doit = task == current;
821 	int cpu;
822 
823 	switch (code) {
824 	case ARCH_SET_GS:
825 		if (addr >= TASK_SIZE_OF(task))
826 			return -EPERM;
827 		cpu = get_cpu();
828 		/* handle small bases via the GDT because that's faster to
829 		   switch. */
830 		if (addr <= 0xffffffff) {
831 			set_32bit_tls(task, GS_TLS, addr);
832 			if (doit) {
833 				load_TLS(&task->thread, cpu);
834 				load_gs_index(GS_TLS_SEL);
835 			}
836 			task->thread.gsindex = GS_TLS_SEL;
837 			task->thread.gs = 0;
838 		} else {
839 			task->thread.gsindex = 0;
840 			task->thread.gs = addr;
841 			if (doit) {
842 				load_gs_index(0);
843 				ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
844 			}
845 		}
846 		put_cpu();
847 		break;
848 	case ARCH_SET_FS:
849 		/* Not strictly needed for fs, but do it for symmetry
850 		   with gs */
851 		if (addr >= TASK_SIZE_OF(task))
852 			return -EPERM;
853 		cpu = get_cpu();
854 		/* handle small bases via the GDT because that's faster to
855 		   switch. */
856 		if (addr <= 0xffffffff) {
857 			set_32bit_tls(task, FS_TLS, addr);
858 			if (doit) {
859 				load_TLS(&task->thread, cpu);
860 				asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL));
861 			}
862 			task->thread.fsindex = FS_TLS_SEL;
863 			task->thread.fs = 0;
864 		} else {
865 			task->thread.fsindex = 0;
866 			task->thread.fs = addr;
867 			if (doit) {
868 				/* set the selector to 0 to not confuse
869 				   __switch_to */
870 				asm volatile("movl %0,%%fs" :: "r" (0));
871 				ret = checking_wrmsrl(MSR_FS_BASE, addr);
872 			}
873 		}
874 		put_cpu();
875 		break;
876 	case ARCH_GET_FS: {
877 		unsigned long base;
878 		if (task->thread.fsindex == FS_TLS_SEL)
879 			base = read_32bit_tls(task, FS_TLS);
880 		else if (doit)
881 			rdmsrl(MSR_FS_BASE, base);
882 		else
883 			base = task->thread.fs;
884 		ret = put_user(base, (unsigned long __user *)addr);
885 		break;
886 	}
887 	case ARCH_GET_GS: {
888 		unsigned long base;
889 		unsigned gsindex;
890 		if (task->thread.gsindex == GS_TLS_SEL)
891 			base = read_32bit_tls(task, GS_TLS);
892 		else if (doit) {
893 			asm("movl %%gs,%0" : "=r" (gsindex));
894 			if (gsindex)
895 				rdmsrl(MSR_KERNEL_GS_BASE, base);
896 			else
897 				base = task->thread.gs;
898 		}
899 		else
900 			base = task->thread.gs;
901 		ret = put_user(base, (unsigned long __user *)addr);
902 		break;
903 	}
904 
905 	default:
906 		ret = -EINVAL;
907 		break;
908 	}
909 
910 	return ret;
911 }
912 
913 long sys_arch_prctl(int code, unsigned long addr)
914 {
915 	return do_arch_prctl(current, code, addr);
916 }
917 
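/*
 * Randomize the initial user stack pointer by up to 8KB (unless address
 * space randomization is disabled) and align it to 16 bytes.
 */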
918 unsigned long arch_align_stack(unsigned long sp)
919 {
920 	if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
921 		sp -= get_random_int() % 8192;
922 	return sp & ~0xf;
923 }
924 
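/*
 * Place the heap start at a random, page-aligned address up to 32MB
 * above the ELF brk.
 */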
925 unsigned long arch_randomize_brk(struct mm_struct *mm)
926 {
927 	unsigned long range_end = mm->brk + 0x02000000;
928 	return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
929 }
930