xref: /linux/arch/x86/kernel/process_64.c (revision ed3174d93c342b8b2eeba6bbd124707d55304a7b)
1 /*
2  *  Copyright (C) 1995  Linus Torvalds
3  *
4  *  Pentium III FXSR, SSE support
5  *	Gareth Hughes <gareth@valinux.com>, May 2000
6  *
7  *  X86-64 port
8  *	Andi Kleen.
9  *
10  *	CPU hotplug support - ashok.raj@intel.com
11  */
12 
13 /*
14  * This file handles the architecture-dependent parts of process handling.
15  */
16 
17 #include <stdarg.h>
18 
19 #include <linux/cpu.h>
20 #include <linux/errno.h>
21 #include <linux/sched.h>
22 #include <linux/fs.h>
23 #include <linux/kernel.h>
24 #include <linux/mm.h>
25 #include <linux/elfcore.h>
26 #include <linux/smp.h>
27 #include <linux/slab.h>
28 #include <linux/user.h>
29 #include <linux/interrupt.h>
30 #include <linux/utsname.h>
31 #include <linux/delay.h>
32 #include <linux/module.h>
33 #include <linux/ptrace.h>
34 #include <linux/random.h>
35 #include <linux/notifier.h>
36 #include <linux/kprobes.h>
37 #include <linux/kdebug.h>
38 #include <linux/tick.h>
39 
40 #include <asm/uaccess.h>
41 #include <asm/pgtable.h>
42 #include <asm/system.h>
43 #include <asm/io.h>
44 #include <asm/processor.h>
45 #include <asm/i387.h>
46 #include <asm/mmu_context.h>
47 #include <asm/pda.h>
48 #include <asm/prctl.h>
49 #include <asm/desc.h>
50 #include <asm/proto.h>
51 #include <asm/ia32.h>
52 #include <asm/idle.h>
53 
54 asmlinkage extern void ret_from_fork(void);
55 
56 unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;
57 
58 unsigned long boot_option_idle_override = 0;
59 EXPORT_SYMBOL(boot_option_idle_override);
60 
61 /*
62  * Power management idle function, if any.
63  */
64 void (*pm_idle)(void);
65 EXPORT_SYMBOL(pm_idle);
66 static DEFINE_PER_CPU(unsigned int, cpu_idle_state);
67 
68 static ATOMIC_NOTIFIER_HEAD(idle_notifier);
69 
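/*
 * CPUs report idle transitions through this notifier chain:
 * enter_idle() fires IDLE_START and __exit_idle() fires IDLE_END
 * for anyone who registered here.
 */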
70 void idle_notifier_register(struct notifier_block *n)
71 {
72 	atomic_notifier_chain_register(&idle_notifier, n);
73 }
74 
75 void enter_idle(void)
76 {
77 	write_pda(isidle, 1);
78 	atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
79 }
80 
81 static void __exit_idle(void)
82 {
83 	if (test_and_clear_bit_pda(0, isidle) == 0)
84 		return;
85 	atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
86 }
87 
88 /* Called from interrupt context to signal the end of idle */
89 void exit_idle(void)
90 {
91 	/* idle loop has pid 0 */
92 	if (current->pid)
93 		return;
94 	__exit_idle();
95 }
96 
97 /*
98  * We use this if we don't have any better
99  * idle routine.
100  */
101 void default_idle(void)
102 {
103 	current_thread_info()->status &= ~TS_POLLING;
104 	/*
105 	 * TS_POLLING-cleared state must be visible before we
106 	 * test NEED_RESCHED:
107 	 */
108 	smp_mb();
109 	local_irq_disable();
110 	if (!need_resched()) {
111 		ktime_t t0, t1;
112 		u64 t0n, t1n;
113 
114 		t0 = ktime_get();
115 		t0n = ktime_to_ns(t0);
116 		safe_halt();	/* enables interrupts racelessly */
117 		local_irq_disable();
118 		t1 = ktime_get();
119 		t1n = ktime_to_ns(t1);
120 		sched_clock_idle_wakeup_event(t1n - t0n);
121 	}
122 	local_irq_enable();
123 	current_thread_info()->status |= TS_POLLING;
124 }
125 
126 /*
127  * On SMP it's slightly faster (but much more power-consuming!)
128  * to poll the ->need_resched flag instead of waiting for the
129  * cross-CPU IPI to arrive. Use this option with caution.
130  */
131 static void poll_idle(void)
132 {
133 	local_irq_enable();
134 	cpu_relax();
135 }
136 
137 #ifdef CONFIG_HOTPLUG_CPU
138 DECLARE_PER_CPU(int, cpu_state);
139 
140 #include <asm/nmi.h>
141 /* With physical CPU hotplug, we simply halt the CPU */
142 static inline void play_dead(void)
143 {
144 	idle_task_exit();
145 	wbinvd();
146 	mb();
147 	/* Ack it */
148 	__get_cpu_var(cpu_state) = CPU_DEAD;
149 
150 	local_irq_disable();
151 	while (1)
152 		halt();
153 }
154 #else
155 static inline void play_dead(void)
156 {
157 	BUG();
158 }
159 #endif /* CONFIG_HOTPLUG_CPU */
160 
161 /*
162  * The idle thread. There's no useful work to be
163  * done, so just try to conserve power and have a
164  * low exit latency (i.e. sit in a loop waiting for
165  * somebody to say that they'd like to reschedule).
166  */
167 void cpu_idle(void)
168 {
169 	current_thread_info()->status |= TS_POLLING;
170 	/* endless idle loop with no priority at all */
171 	while (1) {
172 		tick_nohz_stop_sched_tick();
173 		while (!need_resched()) {
174 			void (*idle)(void);
175 
176 			if (__get_cpu_var(cpu_idle_state))
177 				__get_cpu_var(cpu_idle_state) = 0;
178 
179 			rmb();
180 			idle = pm_idle;
181 			if (!idle)
182 				idle = default_idle;
183 			if (cpu_is_offline(smp_processor_id()))
184 				play_dead();
185 			/*
186 			 * Idle routines should keep interrupts disabled
187 			 * from here on, until they go to idle.
188 			 * Otherwise, idle callbacks can misfire.
189 			 */
190 			local_irq_disable();
191 			enter_idle();
192 			idle();
193 			/* In many cases the interrupt that ended idle
194 			   has already called exit_idle. But some idle
195 			   loops can be woken up without an interrupt. */
196 			__exit_idle();
197 		}
198 
199 		tick_nohz_restart_sched_tick();
200 		preempt_enable_no_resched();
201 		schedule();
202 		preempt_disable();
203 	}
204 }
205 
206 static void do_nothing(void *unused)
207 {
208 }
209 
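/*
 * Wait until every online CPU has left the idle routine it was in when
 * this was called, typically after pm_idle has been changed, so that no
 * CPU is still executing the old idle handler.  Each CPU's
 * cpu_idle_state flag is set here and cleared by that CPU in its idle
 * loop; stragglers get kicked with a dummy IPI.
 */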
210 void cpu_idle_wait(void)
211 {
212 	unsigned int cpu, this_cpu = get_cpu();
213 	cpumask_t map, tmp = current->cpus_allowed;
214 
215 	set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
216 	put_cpu();
217 
218 	cpus_clear(map);
219 	for_each_online_cpu(cpu) {
220 		per_cpu(cpu_idle_state, cpu) = 1;
221 		cpu_set(cpu, map);
222 	}
223 
224 	__get_cpu_var(cpu_idle_state) = 0;
225 
226 	wmb();
227 	do {
228 		ssleep(1);
229 		for_each_online_cpu(cpu) {
230 			if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu))
231 				cpu_clear(cpu, map);
232 		}
233 		cpus_and(map, map, cpu_online_map);
234 		/*
235 		 * We waited 1 sec; if a CPU still has not entered idle,
236 		 * it may be because it is already idle and not waking up
237 		 * because it has nothing to do.
238 		 * Give all the remaining CPUs a kick.
239 		 */
240 		smp_call_function_mask(map, do_nothing, 0, 0);
241 	} while (!cpus_empty(map));
242 
243 	set_cpus_allowed(current, tmp);
244 }
245 EXPORT_SYMBOL_GPL(cpu_idle_wait);
246 
247 /*
248  * This uses the MONITOR/MWAIT instructions on P4 processors with PNI,
249  * which can obviate the IPI used to trigger a check of need_resched.
250  * We execute MONITOR against need_resched and enter an optimized wait
251  * state through MWAIT. Whenever someone changes need_resched, we are
252  * woken up from MWAIT (without an IPI).
253  *
254  * New with Core Duo processors, MWAIT can take some hints based on CPU
255  * capability.
256  */
257 void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
258 {
259 	if (!need_resched()) {
260 		__monitor((void *)&current_thread_info()->flags, 0, 0);
261 		smp_mb();
262 		if (!need_resched())
263 			__mwait(ax, cx);
264 	}
265 }
266 
267 /* Default MONITOR/MWAIT with no hints, used for default C1 state */
268 static void mwait_idle(void)
269 {
270 	if (!need_resched()) {
271 		__monitor((void *)&current_thread_info()->flags, 0, 0);
272 		smp_mb();
273 		if (!need_resched())
274 			__sti_mwait(0, 0);
275 		else
276 			local_irq_enable();
277 	} else {
278 		local_irq_enable();
279 	}
280 }
281 
282 
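/*
 * MWAIT is only used for the idle loop if CPUID leaf 5 advertises at
 * least one C1 sub-state (EDX bits 7:4), unless "idle=mwait" forced it.
 */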
283 static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c)
284 {
285 	if (force_mwait)
286 		return 1;
287 	/* Any C1 states supported? */
288 	return c->cpuid_level >= 5 && ((cpuid_edx(5) >> 4) & 0xf) > 0;
289 }
290 
291 void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
292 {
293 	static int selected;
294 
295 	if (selected)
296 		return;
297 #ifdef CONFIG_X86_SMP
298 	if (pm_idle == poll_idle && smp_num_siblings > 1) {
299 		printk(KERN_WARNING "WARNING: polling idle and HT enabled,"
300 			" performance may degrade.\n");
301 	}
302 #endif
303 	if (cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c)) {
304 		/*
305 		 * Skip if setup has overridden idle.
306 		 * One CPU supports mwait => all CPUs support mwait.
307 		 */
308 		if (!pm_idle) {
309 			printk(KERN_INFO "using mwait in idle threads.\n");
310 			pm_idle = mwait_idle;
311 		}
312 	}
313 	selected = 1;
314 }
315 
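/*
 * "idle=" boot parameter: "idle=poll" selects the polling idle loop,
 * "idle=mwait" forces MWAIT to be used even when it would not be
 * chosen automatically.
 */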
316 static int __init idle_setup(char *str)
317 {
318 	if (!strcmp(str, "poll")) {
319 		printk(KERN_INFO "using polling idle threads.\n");
320 		pm_idle = poll_idle;
321 	} else if (!strcmp(str, "mwait"))
322 		force_mwait = 1;
323 	else
324 		return -1;
325 
326 	boot_option_idle_override = 1;
327 	return 0;
328 }
329 early_param("idle", idle_setup);
330 
331 /* Also prints some state that isn't saved in the pt_regs */
332 void __show_regs(struct pt_regs * regs)
333 {
334 	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
335 	unsigned long d0, d1, d2, d3, d6, d7;
336 	unsigned int fsindex, gsindex;
337 	unsigned int ds, cs, es;
338 
339 	printk("\n");
340 	print_modules();
341 	printk("Pid: %d, comm: %.20s %s %s %.*s\n",
342 		current->pid, current->comm, print_tainted(),
343 		init_utsname()->release,
344 		(int)strcspn(init_utsname()->version, " "),
345 		init_utsname()->version);
346 	printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
347 	printk_address(regs->ip, 1);
348 	printk("RSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss, regs->sp,
349 		regs->flags);
350 	printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
351 	       regs->ax, regs->bx, regs->cx);
352 	printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
353 	       regs->dx, regs->si, regs->di);
354 	printk("RBP: %016lx R08: %016lx R09: %016lx\n",
355 	       regs->bp, regs->r8, regs->r9);
356 	printk("R10: %016lx R11: %016lx R12: %016lx\n",
357 	       regs->r10, regs->r11, regs->r12);
358 	printk("R13: %016lx R14: %016lx R15: %016lx\n",
359 	       regs->r13, regs->r14, regs->r15);
360 
361 	asm("movl %%ds,%0" : "=r" (ds));
362 	asm("movl %%cs,%0" : "=r" (cs));
363 	asm("movl %%es,%0" : "=r" (es));
364 	asm("movl %%fs,%0" : "=r" (fsindex));
365 	asm("movl %%gs,%0" : "=r" (gsindex));
366 
367 	rdmsrl(MSR_FS_BASE, fs);
368 	rdmsrl(MSR_GS_BASE, gs);
369 	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
370 
371 	cr0 = read_cr0();
372 	cr2 = read_cr2();
373 	cr3 = read_cr3();
374 	cr4 = read_cr4();
375 
376 	printk("FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
377 	       fs, fsindex, gs, gsindex, shadowgs);
378 	printk("CS:  %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0);
379 	printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4);
380 
381 	get_debugreg(d0, 0);
382 	get_debugreg(d1, 1);
383 	get_debugreg(d2, 2);
384 	printk("DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
385 	get_debugreg(d3, 3);
386 	get_debugreg(d6, 6);
387 	get_debugreg(d7, 7);
388 	printk("DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
389 }
390 
391 void show_regs(struct pt_regs *regs)
392 {
393 	printk("CPU %d:", smp_processor_id());
394 	__show_regs(regs);
395 	show_trace(NULL, regs, (void *)(regs + 1), regs->bp);
396 }
397 
398 /*
399  * Free current thread data structures etc.
400  */
401 void exit_thread(void)
402 {
403 	struct task_struct *me = current;
404 	struct thread_struct *t = &me->thread;
405 
406 	if (me->thread.io_bitmap_ptr) {
407 		struct tss_struct *tss = &per_cpu(init_tss, get_cpu());
408 
409 		kfree(t->io_bitmap_ptr);
410 		t->io_bitmap_ptr = NULL;
411 		clear_thread_flag(TIF_IO_BITMAP);
412 		/*
413 		 * Careful, clear this in the TSS too:
414 		 */
415 		memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
416 		t->io_bitmap_max = 0;
417 		put_cpu();
418 	}
419 }
420 
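/*
 * Called on exec: finish a pending 32/64-bit ABI switch, clear the
 * debug registers and TLS slots, and forget the FPU state.
 */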
421 void flush_thread(void)
422 {
423 	struct task_struct *tsk = current;
424 
425 	if (test_tsk_thread_flag(tsk, TIF_ABI_PENDING)) {
426 		clear_tsk_thread_flag(tsk, TIF_ABI_PENDING);
427 		if (test_tsk_thread_flag(tsk, TIF_IA32)) {
428 			clear_tsk_thread_flag(tsk, TIF_IA32);
429 		} else {
430 			set_tsk_thread_flag(tsk, TIF_IA32);
431 			current_thread_info()->status |= TS_COMPAT;
432 		}
433 	}
434 	clear_tsk_thread_flag(tsk, TIF_DEBUG);
435 
436 	tsk->thread.debugreg0 = 0;
437 	tsk->thread.debugreg1 = 0;
438 	tsk->thread.debugreg2 = 0;
439 	tsk->thread.debugreg3 = 0;
440 	tsk->thread.debugreg6 = 0;
441 	tsk->thread.debugreg7 = 0;
442 	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
443 	/*
444 	 * Forget coprocessor state..
445 	 */
446 	clear_fpu(tsk);
447 	clear_used_math();
448 }
449 
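/*
 * Sanity check on the way out: a task being released must not still
 * own an LDT; if it does, complain and BUG.
 */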
450 void release_thread(struct task_struct *dead_task)
451 {
452 	if (dead_task->mm) {
453 		if (dead_task->mm->context.size) {
454 			printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
455 					dead_task->comm,
456 					dead_task->mm->context.ldt,
457 					dead_task->mm->context.size);
458 			BUG();
459 		}
460 	}
461 }
462 
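/*
 * Install a 32-bit flat descriptor with the given base address into the
 * task's TLS array, so that small FS/GS bases can be handled via the GDT.
 */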
463 static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
464 {
465 	struct user_desc ud = {
466 		.base_addr = addr,
467 		.limit = 0xfffff,
468 		.seg_32bit = 1,
469 		.limit_in_pages = 1,
470 		.useable = 1,
471 	};
472 	struct desc_struct *desc = t->thread.tls_array;
473 	desc += tls;
474 	fill_ldt(desc, &ud);
475 }
476 
477 static inline u32 read_32bit_tls(struct task_struct *t, int tls)
478 {
479 	return get_desc_base(&t->thread.tls_array[tls]);
480 }
481 
482 /*
483  * This gets called before we allocate a new thread and copy
484  * the current task into it.
485  */
486 void prepare_to_copy(struct task_struct *tsk)
487 {
488 	unlazy_fpu(tsk);
489 }
490 
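
/*
 * Set up the child's thread state for fork/clone: build its kernel-stack
 * pt_regs, copy the segment and FS/GS state from the parent, duplicate
 * the I/O permission bitmap if one is in use, and honour CLONE_SETTLS.
 */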
491 int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
492 		unsigned long unused,
493 	struct task_struct * p, struct pt_regs * regs)
494 {
495 	int err;
496 	struct pt_regs * childregs;
497 	struct task_struct *me = current;
498 
499 	childregs = ((struct pt_regs *)
500 			(THREAD_SIZE + task_stack_page(p))) - 1;
501 	*childregs = *regs;
502 
503 	childregs->ax = 0;
504 	childregs->sp = sp;
505 	if (sp == ~0UL)
506 		childregs->sp = (unsigned long)childregs;
507 
508 	p->thread.sp = (unsigned long) childregs;
509 	p->thread.sp0 = (unsigned long) (childregs+1);
510 	p->thread.usersp = me->thread.usersp;
511 
512 	set_tsk_thread_flag(p, TIF_FORK);
513 
514 	p->thread.fs = me->thread.fs;
515 	p->thread.gs = me->thread.gs;
516 
517 	asm("mov %%gs,%0" : "=m" (p->thread.gsindex));
518 	asm("mov %%fs,%0" : "=m" (p->thread.fsindex));
519 	asm("mov %%es,%0" : "=m" (p->thread.es));
520 	asm("mov %%ds,%0" : "=m" (p->thread.ds));
521 
522 	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
523 		p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
524 		if (!p->thread.io_bitmap_ptr) {
525 			p->thread.io_bitmap_max = 0;
526 			return -ENOMEM;
527 		}
528 		memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
529 				IO_BITMAP_BYTES);
530 		set_tsk_thread_flag(p, TIF_IO_BITMAP);
531 	}
532 
533 	/*
534 	 * Set a new TLS for the child thread?
535 	 */
536 	if (clone_flags & CLONE_SETTLS) {
537 #ifdef CONFIG_IA32_EMULATION
538 		if (test_thread_flag(TIF_IA32))
539 			err = do_set_thread_area(p, -1,
540 				(struct user_desc __user *)childregs->si, 0);
541 		else
542 #endif
543 			err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
544 		if (err)
545 			goto out;
546 	}
547 	err = 0;
548 out:
549 	if (err && p->thread.io_bitmap_ptr) {
550 		kfree(p->thread.io_bitmap_ptr);
551 		p->thread.io_bitmap_max = 0;
552 	}
553 	return err;
554 }
555 
556 /*
557  * This special macro can be used to load a debugging register
558  */
559 #define loaddebug(thread, r) set_debugreg(thread->debugreg ## r, r)
560 
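/*
 * The slow path of a context switch: DS area and DEBUGCTL MSR updates,
 * hardware debug register reloads, I/O bitmap copying and BTS
 * timestamps.  Only called when either task has the corresponding
 * TIF_ flags set.
 */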
561 static inline void __switch_to_xtra(struct task_struct *prev_p,
562 				    struct task_struct *next_p,
563 				    struct tss_struct *tss)
564 {
565 	struct thread_struct *prev, *next;
566 	unsigned long debugctl;
567 
568 	prev = &prev_p->thread;
569 	next = &next_p->thread;
570 
571 	debugctl = prev->debugctlmsr;
572 	if (next->ds_area_msr != prev->ds_area_msr) {
573 		/* Clear debugctl to make sure DS
574 		 * is not in use while we change it. */
575 		debugctl = 0;
576 		wrmsrl(MSR_IA32_DEBUGCTLMSR, 0);
577 		wrmsrl(MSR_IA32_DS_AREA, next->ds_area_msr);
578 	}
579 
580 	if (next->debugctlmsr != debugctl)
581 		wrmsrl(MSR_IA32_DEBUGCTLMSR, next->debugctlmsr);
582 
583 	if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
584 		loaddebug(next, 0);
585 		loaddebug(next, 1);
586 		loaddebug(next, 2);
587 		loaddebug(next, 3);
588 		/* no 4 and 5 */
589 		loaddebug(next, 6);
590 		loaddebug(next, 7);
591 	}
592 
593 	if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
594 		/*
595 		 * Copy the relevant range of the IO bitmap.
596 		 * Normally this is 128 bytes or less:
597 		 */
598 		memcpy(tss->io_bitmap, next->io_bitmap_ptr,
599 		       max(prev->io_bitmap_max, next->io_bitmap_max));
600 	} else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
601 		/*
602 		 * Clear any possible leftover bits:
603 		 */
604 		memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
605 	}
606 
607 	if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
608 		ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);
609 
610 	if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
611 		ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
612 }
613 
614 /*
615  *	switch_to(x,y) should switch tasks from x to y.
616  *
617  * This could still be optimized:
618  * - fold all the options into a flag word and test it with a single test.
619  * - could test fs/gs bitsliced
620  *
621  * Kprobes not supported here. Set the probe on schedule instead.
622  */
623 struct task_struct *
624 __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
625 {
626 	struct thread_struct *prev = &prev_p->thread,
627 				 *next = &next_p->thread;
628 	int cpu = smp_processor_id();
629 	struct tss_struct *tss = &per_cpu(init_tss, cpu);
630 
631 	/* we're going to use this soon, after a few expensive things */
632 	if (next_p->fpu_counter > 5)
633 		prefetch(&next->i387.fxsave);
634 
635 	/*
636 	 * Reload esp0, LDT and the page table pointer:
637 	 * Reload sp0 (the kernel stack pointer in the TSS):
638 	load_sp0(tss, next);
639 
640 	/*
641 	 * Switch DS and ES.
642 	 * This won't pick up thread selector changes, but I guess that is ok.
643 	 */
644 	asm volatile("mov %%es,%0" : "=m" (prev->es));
645 	if (unlikely(next->es | prev->es))
646 		loadsegment(es, next->es);
647 
648 	asm volatile ("mov %%ds,%0" : "=m" (prev->ds));
649 	if (unlikely(next->ds | prev->ds))
650 		loadsegment(ds, next->ds);
651 
652 	load_TLS(next, cpu);
653 
654 	/*
655 	 * Switch FS and GS.
656 	 */
657 	{
658 		unsigned fsindex;
659 		asm volatile("movl %%fs,%0" : "=r" (fsindex));
660 		/* A segment register != 0 always requires a reload.
661 		   Also reload when it has changed.
662 		   When the previous process used a 64-bit base, always
663 		   reload to avoid an information leak. */
664 		if (unlikely(fsindex | next->fsindex | prev->fs)) {
665 			loadsegment(fs, next->fsindex);
666 			/* Check if the user used a selector != 0;
667 			 * if so, clear the 64-bit base, since a 64-bit
668 			 * base is only used with the null selector.
669 			 */
670 			if (fsindex)
671 				prev->fs = 0;
672 		}
673 		/* when next process has a 64bit base use it */
674 		if (next->fs)
675 			wrmsrl(MSR_FS_BASE, next->fs);
676 		prev->fsindex = fsindex;
677 	}
678 	{
679 		unsigned gsindex;
680 		asm volatile("movl %%gs,%0" : "=r" (gsindex));
681 		if (unlikely(gsindex | next->gsindex | prev->gs)) {
682 			load_gs_index(next->gsindex);
683 			if (gsindex)
684 				prev->gs = 0;
685 		}
686 		if (next->gs)
687 			wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
688 		prev->gsindex = gsindex;
689 	}
690 
691 	/* Must be after DS reload */
692 	unlazy_fpu(prev_p);
693 
694 	/*
695 	 * Switch the PDA and FPU contexts.
696 	 */
697 	prev->usersp = read_pda(oldrsp);
698 	write_pda(oldrsp, next->usersp);
699 	write_pda(pcurrent, next_p);
700 
701 	write_pda(kernelstack,
702 	(unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
703 #ifdef CONFIG_CC_STACKPROTECTOR
704 	write_pda(stack_canary, next_p->stack_canary);
705 	/*
706 	 * Build-time-only check to make sure the stack_canary is at
707 	 * offset 40 in the PDA; this is a gcc ABI requirement.
708 	 */
709 	BUILD_BUG_ON(offsetof(struct x8664_pda, stack_canary) != 40);
710 #endif
711 
712 	/*
713 	 * Now maybe reload the debug registers and handle I/O bitmaps
714 	 */
715 	if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
716 		     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
717 		__switch_to_xtra(prev_p, next_p, tss);
718 
719 	/* If the task has used the FPU in the last 5 timeslices, just do a
720 	 * full restore of the math state immediately to avoid the trap; the
721 	 * chances of needing the FPU again soon are obviously high now.
722 	 */
723 	if (next_p->fpu_counter > 5)
724 		math_state_restore();
725 	return prev_p;
726 }
727 
728 /*
729  * sys_execve() executes a new program.
730  */
731 asmlinkage
732 long sys_execve(char __user *name, char __user * __user *argv,
733 		char __user * __user *envp, struct pt_regs regs)
734 {
735 	long error;
736 	char * filename;
737 
738 	filename = getname(name);
739 	error = PTR_ERR(filename);
740 	if (IS_ERR(filename))
741 		return error;
742 	error = do_execve(filename, argv, envp, &regs);
743 	putname(filename);
744 	return error;
745 }
746 
747 void set_personality_64bit(void)
748 {
749 	/* inherit personality from parent */
750 
751 	/* Make sure to be in 64bit mode */
752 	clear_thread_flag(TIF_IA32);
753 
754 	/* TBD: overwrites user setup. Should have two bits.
755 	   But 64bit processes have always behaved this way,
756 	   so it's not too bad. The main problem is just that
757 	   32-bit children are affected again. */
758 	current->personality &= ~READ_IMPLIES_EXEC;
759 }
760 
761 asmlinkage long sys_fork(struct pt_regs *regs)
762 {
763 	return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL);
764 }
765 
766 asmlinkage long
767 sys_clone(unsigned long clone_flags, unsigned long newsp,
768 	  void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
769 {
770 	if (!newsp)
771 		newsp = regs->sp;
772 	return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
773 }
774 
775 /*
776  * This is trivial, and on the face of it looks like it
777  * could equally well be done in user mode.
778  *
779  * Not so, for quite unobvious reasons - register pressure.
780  * In user mode vfork() cannot have a stack frame, and if
781  * done by calling the "clone()" system call directly, you
782  * do not have enough call-clobbered registers to hold all
783  * the information you need.
784  */
785 asmlinkage long sys_vfork(struct pt_regs *regs)
786 {
787 	return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0,
788 		    NULL, NULL);
789 }
790 
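/*
 * Walk the saved frame pointers of a sleeping task's kernel stack and
 * return the first return address outside the scheduler; this is what
 * shows up as the "wait channel" in /proc/<pid>/wchan.
 */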
791 unsigned long get_wchan(struct task_struct *p)
792 {
793 	unsigned long stack;
794 	u64 fp, ip;
795 	int count = 0;
796 
797 	if (!p || p == current || p->state == TASK_RUNNING)
798 		return 0;
799 	stack = (unsigned long)task_stack_page(p);
800 	if (p->thread.sp < stack || p->thread.sp > stack+THREAD_SIZE)
801 		return 0;
802 	fp = *(u64 *)(p->thread.sp);
803 	do {
804 		if (fp < (unsigned long)stack ||
805 		    fp > (unsigned long)stack+THREAD_SIZE)
806 			return 0;
807 		ip = *(u64 *)(fp+8);
808 		if (!in_sched_functions(ip))
809 			return ip;
810 		fp = *(u64 *)fp;
811 	} while (count++ < 16);
812 	return 0;
813 }
814 
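/*
 * Set or read the FS/GS base of a task.  Small bases go through a GDT
 * descriptor, large ones through the MSR_FS_BASE/MSR_KERNEL_GS_BASE MSRs.
 *
 * Reached from user space via arch_prctl(2), e.g. (illustrative only,
 * tls_block being a hypothetical pointer to a thread-local block):
 *	arch_prctl(ARCH_SET_FS, (unsigned long)tls_block);
 */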
815 long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
816 {
817 	int ret = 0;
818 	int doit = task == current;
819 	int cpu;
820 
821 	switch (code) {
822 	case ARCH_SET_GS:
823 		if (addr >= TASK_SIZE_OF(task))
824 			return -EPERM;
825 		cpu = get_cpu();
826 		/* handle small bases via the GDT because that's faster to
827 		   switch. */
828 		if (addr <= 0xffffffff) {
829 			set_32bit_tls(task, GS_TLS, addr);
830 			if (doit) {
831 				load_TLS(&task->thread, cpu);
832 				load_gs_index(GS_TLS_SEL);
833 			}
834 			task->thread.gsindex = GS_TLS_SEL;
835 			task->thread.gs = 0;
836 		} else {
837 			task->thread.gsindex = 0;
838 			task->thread.gs = addr;
839 			if (doit) {
840 				load_gs_index(0);
841 				ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
842 			}
843 		}
844 		put_cpu();
845 		break;
846 	case ARCH_SET_FS:
847 		/* Not strictly needed for fs, but do it for symmetry
848 		   with gs */
849 		if (addr >= TASK_SIZE_OF(task))
850 			return -EPERM;
851 		cpu = get_cpu();
852 		/* handle small bases via the GDT because that's faster to
853 		   switch. */
854 		if (addr <= 0xffffffff) {
855 			set_32bit_tls(task, FS_TLS, addr);
856 			if (doit) {
857 				load_TLS(&task->thread, cpu);
858 				asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL));
859 			}
860 			task->thread.fsindex = FS_TLS_SEL;
861 			task->thread.fs = 0;
862 		} else {
863 			task->thread.fsindex = 0;
864 			task->thread.fs = addr;
865 			if (doit) {
866 				/* set the selector to 0 to not confuse
867 				   __switch_to */
868 				asm volatile("movl %0,%%fs" :: "r" (0));
869 				ret = checking_wrmsrl(MSR_FS_BASE, addr);
870 			}
871 		}
872 		put_cpu();
873 		break;
874 	case ARCH_GET_FS: {
875 		unsigned long base;
876 		if (task->thread.fsindex == FS_TLS_SEL)
877 			base = read_32bit_tls(task, FS_TLS);
878 		else if (doit)
879 			rdmsrl(MSR_FS_BASE, base);
880 		else
881 			base = task->thread.fs;
882 		ret = put_user(base, (unsigned long __user *)addr);
883 		break;
884 	}
885 	case ARCH_GET_GS: {
886 		unsigned long base;
887 		unsigned gsindex;
888 		if (task->thread.gsindex == GS_TLS_SEL)
889 			base = read_32bit_tls(task, GS_TLS);
890 		else if (doit) {
891 			asm("movl %%gs,%0" : "=r" (gsindex));
892 			if (gsindex)
893 				rdmsrl(MSR_KERNEL_GS_BASE, base);
894 			else
895 				base = task->thread.gs;
896 		}
897 		else
898 			base = task->thread.gs;
899 		ret = put_user(base, (unsigned long __user *)addr);
900 		break;
901 	}
902 
903 	default:
904 		ret = -EINVAL;
905 		break;
906 	}
907 
908 	return ret;
909 }
910 
911 long sys_arch_prctl(int code, unsigned long addr)
912 {
913 	return do_arch_prctl(current, code, addr);
914 }
915 
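/*
 * Randomize the initial user stack pointer by up to 8k (unless address
 * space randomization is disabled for the task or globally) and align
 * it to 16 bytes.
 */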
916 unsigned long arch_align_stack(unsigned long sp)
917 {
918 	if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
919 		sp -= get_random_int() % 8192;
920 	return sp & ~0xf;
921 }
922 
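/*
 * Pick a randomized start for the brk area within 32MB above the
 * current brk, falling back to the current brk itself.
 */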
923 unsigned long arch_randomize_brk(struct mm_struct *mm)
924 {
925 	unsigned long range_end = mm->brk + 0x02000000;
926 	return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
927 }
928