xref: /linux/arch/x86/kernel/process_64.c (revision c0c9209ddd96bc4f1d70a8b9958710671e076080)
/*
 *  Copyright (C) 1995  Linus Torvalds
 *
 *  Pentium III FXSR, SSE support
 *	Gareth Hughes <gareth@valinux.com>, May 2000
 *
 *  X86-64 port
 *	Andi Kleen.
 *
 *	CPU hotplug support - ashok.raj@intel.com
 */

/*
 * This file handles the architecture-dependent parts of process handling..
 */

#include <stdarg.h>

#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/interrupt.h>
#include <linux/utsname.h>
#include <linux/delay.h>
#include <linux/module.h>
#include <linux/ptrace.h>
#include <linux/random.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/tick.h>
#include <linux/prctl.h>

#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/system.h>
#include <asm/io.h>
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/mmu_context.h>
#include <asm/pda.h>
#include <asm/prctl.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/ia32.h>
#include <asm/idle.h>

asmlinkage extern void ret_from_fork(void);

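/*
 * Default flags used for kernel threads: they share the kernel's address
 * space (CLONE_VM) and are never automatically traced (CLONE_UNTRACED).
 */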
unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;

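/*
 * Idle notifier chain: interested subsystems register here to be told when
 * a CPU enters (IDLE_START) and leaves (IDLE_END) the idle loop.
 * enter_idle() and __exit_idle() below fire the notifications and track the
 * per-CPU "isidle" flag in the PDA.
 */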
static ATOMIC_NOTIFIER_HEAD(idle_notifier);

void idle_notifier_register(struct notifier_block *n)
{
	atomic_notifier_chain_register(&idle_notifier, n);
}

void enter_idle(void)
{
	write_pda(isidle, 1);
	atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}

static void __exit_idle(void)
{
	if (test_and_clear_bit_pda(0, isidle) == 0)
		return;
	atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
}

/* Called from interrupts to signify idle end */
void exit_idle(void)
{
	/* idle loop has pid 0 */
	if (current->pid)
		return;
	__exit_idle();
}

#ifdef CONFIG_HOTPLUG_CPU
DECLARE_PER_CPU(int, cpu_state);

#include <asm/nmi.h>
/* We halt the CPU with physical CPU hotplug */
static inline void play_dead(void)
{
	idle_task_exit();
	c1e_remove_cpu(raw_smp_processor_id());

	mb();
	/* Ack it */
	__get_cpu_var(cpu_state) = CPU_DEAD;

	local_irq_disable();
	/* mask all interrupts, flush any and all caches, and halt */
	wbinvd_halt();
}
#else
static inline void play_dead(void)
{
	BUG();
}
#endif /* CONFIG_HOTPLUG_CPU */

/*
 * The idle thread. There's no useful work to be
 * done, so just try to conserve power and have a
 * low exit latency (ie sit in a loop waiting for
 * somebody to say that they'd like to reschedule)
 */
void cpu_idle(void)
{
	current_thread_info()->status |= TS_POLLING;
	/* endless idle loop with no priority at all */
	while (1) {
		tick_nohz_stop_sched_tick(1);
		while (!need_resched()) {

			rmb();

			if (cpu_is_offline(smp_processor_id()))
				play_dead();
			/*
			 * Idle routines should keep interrupts disabled
			 * from here on, until they go to idle.
			 * Otherwise, idle callbacks can misfire.
			 */
			local_irq_disable();
			enter_idle();
			/* Don't trace irqs off for idle */
			stop_critical_timings();
			pm_idle();
			start_critical_timings();
			/* In many cases the interrupt that ended idle
			   has already called exit_idle. But some idle
			   loops can be woken up without interrupt. */
			__exit_idle();
		}

		tick_nohz_restart_sched_tick();
		preempt_enable_no_resched();
		schedule();
		preempt_disable();
	}
}

/* Prints also some state that isn't saved in the pt_regs */
void __show_regs(struct pt_regs *regs)
{
	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
	unsigned long d0, d1, d2, d3, d6, d7;
	unsigned int fsindex, gsindex;
	unsigned int ds, cs, es;

	printk("\n");
	print_modules();
	printk("Pid: %d, comm: %.20s %s %s %.*s\n",
		current->pid, current->comm, print_tainted(),
		init_utsname()->release,
		(int)strcspn(init_utsname()->version, " "),
		init_utsname()->version);
	printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
	printk_address(regs->ip, 1);
	printk("RSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss, regs->sp,
		regs->flags);
	printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
	       regs->ax, regs->bx, regs->cx);
	printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
	       regs->dx, regs->si, regs->di);
	printk("RBP: %016lx R08: %016lx R09: %016lx\n",
	       regs->bp, regs->r8, regs->r9);
	printk("R10: %016lx R11: %016lx R12: %016lx\n",
	       regs->r10, regs->r11, regs->r12);
	printk("R13: %016lx R14: %016lx R15: %016lx\n",
	       regs->r13, regs->r14, regs->r15);

	asm("movl %%ds,%0" : "=r" (ds));
	asm("movl %%cs,%0" : "=r" (cs));
	asm("movl %%es,%0" : "=r" (es));
	asm("movl %%fs,%0" : "=r" (fsindex));
	asm("movl %%gs,%0" : "=r" (gsindex));

	rdmsrl(MSR_FS_BASE, fs);
	rdmsrl(MSR_GS_BASE, gs);
	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

	cr0 = read_cr0();
	cr2 = read_cr2();
	cr3 = read_cr3();
	cr4 = read_cr4();

	printk("FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
	       fs, fsindex, gs, gsindex, shadowgs);
	printk("CS:  %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0);
	printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4);

	get_debugreg(d0, 0);
	get_debugreg(d1, 1);
	get_debugreg(d2, 2);
	printk("DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
	get_debugreg(d3, 3);
	get_debugreg(d6, 6);
	get_debugreg(d7, 7);
	printk("DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
}

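/*
 * Full register dump plus a backtrace; the stack trace starts right after
 * the saved pt_regs frame.
 */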
void show_regs(struct pt_regs *regs)
{
	printk("CPU %d:", smp_processor_id());
	__show_regs(regs);
	show_trace(NULL, regs, (void *)(regs + 1), regs->bp);
}

/*
 * Free current thread data structures etc..
 */
void exit_thread(void)
{
	struct task_struct *me = current;
	struct thread_struct *t = &me->thread;

	if (me->thread.io_bitmap_ptr) {
		struct tss_struct *tss = &per_cpu(init_tss, get_cpu());

		kfree(t->io_bitmap_ptr);
		t->io_bitmap_ptr = NULL;
		clear_thread_flag(TIF_IO_BITMAP);
		/*
		 * Careful, clear this in the TSS too:
		 */
		memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
		t->io_bitmap_max = 0;
		put_cpu();
	}
}

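/*
 * Called on exec: finish any pending 32/64-bit ABI switch, then reset the
 * debug registers, TLS slots and FPU state so the new image starts clean.
 */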
void flush_thread(void)
{
	struct task_struct *tsk = current;

	if (test_tsk_thread_flag(tsk, TIF_ABI_PENDING)) {
		clear_tsk_thread_flag(tsk, TIF_ABI_PENDING);
		if (test_tsk_thread_flag(tsk, TIF_IA32)) {
			clear_tsk_thread_flag(tsk, TIF_IA32);
		} else {
			set_tsk_thread_flag(tsk, TIF_IA32);
			current_thread_info()->status |= TS_COMPAT;
		}
	}
	clear_tsk_thread_flag(tsk, TIF_DEBUG);

	tsk->thread.debugreg0 = 0;
	tsk->thread.debugreg1 = 0;
	tsk->thread.debugreg2 = 0;
	tsk->thread.debugreg3 = 0;
	tsk->thread.debugreg6 = 0;
	tsk->thread.debugreg7 = 0;
	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
	/*
	 * Forget coprocessor state..
	 */
	tsk->fpu_counter = 0;
	clear_fpu(tsk);
	clear_used_math();
}

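/*
 * Final per-thread teardown: by the time we get here the LDT must already
 * have been released, so a leftover LDT indicates a bug.
 */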
void release_thread(struct task_struct *dead_task)
{
	if (dead_task->mm) {
		if (dead_task->mm->context.size) {
			printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
					dead_task->comm,
					dead_task->mm->context.ldt,
					dead_task->mm->context.size);
			BUG();
		}
	}
}

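/*
 * Helpers for stashing a 32-bit segment base in one of the GDT TLS slots.
 * do_arch_prctl() uses them for FS/GS bases that fit below 4GB, because
 * reloading a TLS selector is cheaper than an MSR write on every switch.
 */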
static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
{
	struct user_desc ud = {
		.base_addr = addr,
		.limit = 0xfffff,
		.seg_32bit = 1,
		.limit_in_pages = 1,
		.useable = 1,
	};
	struct desc_struct *desc = t->thread.tls_array;
	desc += tls;
	fill_ldt(desc, &ud);
}

static inline u32 read_32bit_tls(struct task_struct *t, int tls)
{
	return get_desc_base(&t->thread.tls_array[tls]);
}

/*
 * This gets called before we allocate a new thread and copy
 * the current task into it.
 */
void prepare_to_copy(struct task_struct *tsk)
{
	unlazy_fpu(tsk);
}

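/*
 * Set up the child's kernel stack and register frame for fork/clone: the
 * child returns 0 (ax = 0), inherits the parent's segment state, gets its
 * own copy of the I/O bitmap if one is in use, and honours CLONE_SETTLS.
 * For a kernel thread (sp == ~0UL) the stack pointer is aimed at the
 * register frame itself.
 */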
int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
		unsigned long unused,
	struct task_struct *p, struct pt_regs *regs)
{
	int err;
	struct pt_regs *childregs;
	struct task_struct *me = current;

	childregs = ((struct pt_regs *)
			(THREAD_SIZE + task_stack_page(p))) - 1;
	*childregs = *regs;

	childregs->ax = 0;
	childregs->sp = sp;
	if (sp == ~0UL)
		childregs->sp = (unsigned long)childregs;

	p->thread.sp = (unsigned long) childregs;
	p->thread.sp0 = (unsigned long) (childregs+1);
	p->thread.usersp = me->thread.usersp;

	set_tsk_thread_flag(p, TIF_FORK);

	p->thread.fs = me->thread.fs;
	p->thread.gs = me->thread.gs;

	savesegment(gs, p->thread.gsindex);
	savesegment(fs, p->thread.fsindex);
	savesegment(es, p->thread.es);
	savesegment(ds, p->thread.ds);

	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
		p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
		if (!p->thread.io_bitmap_ptr) {
			p->thread.io_bitmap_max = 0;
			return -ENOMEM;
		}
		memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
				IO_BITMAP_BYTES);
		set_tsk_thread_flag(p, TIF_IO_BITMAP);
	}

	/*
	 * Set a new TLS for the child thread?
	 */
	if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
		if (test_thread_flag(TIF_IA32))
			err = do_set_thread_area(p, -1,
				(struct user_desc __user *)childregs->si, 0);
		else
#endif
			err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
		if (err)
			goto out;
	}
	err = 0;
out:
	if (err && p->thread.io_bitmap_ptr) {
		kfree(p->thread.io_bitmap_ptr);
		p->thread.io_bitmap_max = 0;
	}
	return err;
}

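/*
 * Prepare the register state of a freshly exec'd 64-bit task: flat user
 * code/data segments, the new entry point and stack pointer, and flags set
 * to 0x200 (X86_EFLAGS_IF) so it starts with interrupts enabled.
 */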
void
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
	loadsegment(fs, 0);
	loadsegment(es, 0);
	loadsegment(ds, 0);
	load_gs_index(0);
	regs->ip		= new_ip;
	regs->sp		= new_sp;
	write_pda(oldrsp, new_sp);
	regs->cs		= __USER_CS;
	regs->ss		= __USER_DS;
	regs->flags		= 0x200;
	set_fs(USER_DS);
	/*
	 * Free the old FP and other extended state
	 */
	free_thread_xstate(current);
}
EXPORT_SYMBOL_GPL(start_thread);

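/*
 * TSC access control for PR_SET_TSC: setting CR4.TSD makes RDTSC a
 * privileged instruction, so a user-mode RDTSC faults and the task gets
 * SIGSEGV, as requested by PR_TSC_SIGSEGV.
 */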
static void hard_disable_TSC(void)
{
	write_cr4(read_cr4() | X86_CR4_TSD);
}

void disable_TSC(void)
{
	preempt_disable();
	if (!test_and_set_thread_flag(TIF_NOTSC))
		/*
		 * Must flip the CPU state synchronously with
		 * TIF_NOTSC in the current running context.
		 */
		hard_disable_TSC();
	preempt_enable();
}

static void hard_enable_TSC(void)
{
	write_cr4(read_cr4() & ~X86_CR4_TSD);
}

static void enable_TSC(void)
{
	preempt_disable();
	if (test_and_clear_thread_flag(TIF_NOTSC))
		/*
		 * Must flip the CPU state synchronously with
		 * TIF_NOTSC in the current running context.
		 */
		hard_enable_TSC();
	preempt_enable();
}

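/*
 * get_tsc_mode()/set_tsc_mode() back the PR_GET_TSC/PR_SET_TSC prctl()
 * operations.  Illustrative userspace usage (not part of this file):
 *
 *	prctl(PR_SET_TSC, PR_TSC_SIGSEGV, 0, 0, 0);	- fault on rdtsc
 *	prctl(PR_SET_TSC, PR_TSC_ENABLE, 0, 0, 0);	- allow rdtsc again
 */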
int get_tsc_mode(unsigned long adr)
{
	unsigned int val;

	if (test_thread_flag(TIF_NOTSC))
		val = PR_TSC_SIGSEGV;
	else
		val = PR_TSC_ENABLE;

	return put_user(val, (unsigned int __user *)adr);
}

int set_tsc_mode(unsigned int val)
{
	if (val == PR_TSC_SIGSEGV)
		disable_TSC();
	else if (val == PR_TSC_ENABLE)
		enable_TSC();
	else
		return -EINVAL;

	return 0;
}

/*
 * This special macro can be used to load a debugging register
 */
#define loaddebug(thread, r) set_debugreg(thread->debugreg ## r, r)

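/*
 * Slow path of the context switch, reached from __switch_to() only when one
 * of the _TIF_WORK_CTXSW flags is set: reload debug registers, switch the
 * DS/BTS area, flip TSC access, and copy or clear the I/O permission bitmap.
 */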
static inline void __switch_to_xtra(struct task_struct *prev_p,
				    struct task_struct *next_p,
				    struct tss_struct *tss)
{
	struct thread_struct *prev, *next;
	unsigned long debugctl;

	prev = &prev_p->thread;
	next = &next_p->thread;

	debugctl = prev->debugctlmsr;
	if (next->ds_area_msr != prev->ds_area_msr) {
		/* we clear debugctl to make sure DS
		 * is not in use when we change it */
		debugctl = 0;
		update_debugctlmsr(0);
		wrmsrl(MSR_IA32_DS_AREA, next->ds_area_msr);
	}

	if (next->debugctlmsr != debugctl)
		update_debugctlmsr(next->debugctlmsr);

	if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
		loaddebug(next, 0);
		loaddebug(next, 1);
		loaddebug(next, 2);
		loaddebug(next, 3);
		/* no 4 and 5 */
		loaddebug(next, 6);
		loaddebug(next, 7);
	}

	if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
	    test_tsk_thread_flag(next_p, TIF_NOTSC)) {
		/* prev and next are different */
		if (test_tsk_thread_flag(next_p, TIF_NOTSC))
			hard_disable_TSC();
		else
			hard_enable_TSC();
	}

	if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
		/*
		 * Copy the relevant range of the IO bitmap.
		 * Normally this is 128 bytes or less:
		 */
		memcpy(tss->io_bitmap, next->io_bitmap_ptr,
		       max(prev->io_bitmap_max, next->io_bitmap_max));
	} else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
		/*
		 * Clear any possible leftover bits:
		 */
		memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
	}

#ifdef X86_BTS
	if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
		ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);

	if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
		ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
#endif
}

/*
 *	switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes not supported here. Set the probe on schedule instead.
 */
struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
	struct thread_struct *prev = &prev_p->thread;
	struct thread_struct *next = &next_p->thread;
	int cpu = smp_processor_id();
	struct tss_struct *tss = &per_cpu(init_tss, cpu);
	unsigned fsindex, gsindex;

	/* we're going to use this soon, after a few expensive things */
	if (next_p->fpu_counter > 5)
		prefetch(next->xstate);

	/*
	 * Reload esp0, LDT and the page table pointer:
	 */
	load_sp0(tss, next);

	/*
	 * Switch DS and ES.
	 * This won't pick up thread selector changes, but I guess that is ok.
	 */
	savesegment(es, prev->es);
	if (unlikely(next->es | prev->es))
		loadsegment(es, next->es);

	savesegment(ds, prev->ds);
	if (unlikely(next->ds | prev->ds))
		loadsegment(ds, next->ds);


	/* We must save %fs and %gs before load_TLS() because
	 * %fs and %gs may be cleared by load_TLS().
	 *
	 * (e.g. xen_load_tls())
	 */
	savesegment(fs, fsindex);
	savesegment(gs, gsindex);

	load_TLS(next, cpu);

	/*
	 * Leave lazy mode, flushing any hypercalls made here.
	 * This must be done before restoring TLS segments so
	 * the GDT and LDT are properly updated, and must be
	 * done before math_state_restore, so the TS bit is up
	 * to date.
	 */
	arch_leave_lazy_cpu_mode();

	/*
	 * Switch FS and GS.
	 *
	 * Segment register != 0 always requires a reload.  Also
	 * reload when it has changed.  When prev process used 64bit
	 * base always reload to avoid an information leak.
	 */
	if (unlikely(fsindex | next->fsindex | prev->fs)) {
		loadsegment(fs, next->fsindex);
		/*
		 * Check if the user used a selector != 0; if yes
		 *  clear 64bit base, since overloaded base is always
		 *  mapped to the Null selector
		 */
		if (fsindex)
			prev->fs = 0;
	}
	/* when next process has a 64bit base use it */
	if (next->fs)
		wrmsrl(MSR_FS_BASE, next->fs);
	prev->fsindex = fsindex;

	if (unlikely(gsindex | next->gsindex | prev->gs)) {
		load_gs_index(next->gsindex);
		if (gsindex)
			prev->gs = 0;
	}
	if (next->gs)
		wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
	prev->gsindex = gsindex;

	/* Must be after DS reload */
	unlazy_fpu(prev_p);

	/*
	 * Switch the PDA and FPU contexts.
	 */
	prev->usersp = read_pda(oldrsp);
	write_pda(oldrsp, next->usersp);
	write_pda(pcurrent, next_p);

	write_pda(kernelstack,
		  (unsigned long)task_stack_page(next_p) +
		  THREAD_SIZE - PDA_STACKOFFSET);
#ifdef CONFIG_CC_STACKPROTECTOR
	write_pda(stack_canary, next_p->stack_canary);
	/*
	 * Build time only check to make sure the stack_canary is at
	 * offset 40 in the pda; this is a gcc ABI requirement
	 */
	BUILD_BUG_ON(offsetof(struct x8664_pda, stack_canary) != 40);
#endif

	/*
	 * Now maybe reload the debug registers and handle I/O bitmaps
	 */
	if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
		     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
		__switch_to_xtra(prev_p, next_p, tss);

	/* If the task has used fpu the last 5 timeslices, just do a full
	 * restore of the math state immediately to avoid the trap; the
	 * chances of needing FPU soon are obviously high now
	 *
	 * tsk_used_math() checks prevent calling math_state_restore(),
	 * which can sleep in the case of !tsk_used_math()
	 */
	if (tsk_used_math(next_p) && next_p->fpu_counter > 5)
		math_state_restore();
	return prev_p;
}

/*
 * sys_execve() executes a new program.
 */
asmlinkage
long sys_execve(char __user *name, char __user * __user *argv,
		char __user * __user *envp, struct pt_regs *regs)
{
	long error;
	char *filename;

	filename = getname(name);
	error = PTR_ERR(filename);
	if (IS_ERR(filename))
		return error;
	error = do_execve(filename, argv, envp, regs);
	putname(filename);
	return error;
}

void set_personality_64bit(void)
{
	/* inherit personality from parent */

	/* Make sure to be in 64bit mode */
	clear_thread_flag(TIF_IA32);

	/* TBD: overwrites user setup. Should have two bits.
	   But 64bit processes have always behaved this way,
	   so it's not too bad. The main problem is just that
	   32bit children are affected again. */
	current->personality &= ~READ_IMPLIES_EXEC;
}

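/*
 * The fork/clone/vfork entry points below all funnel into do_fork(); the
 * child's stack pointer defaults to the parent's saved user sp in pt_regs.
 */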
asmlinkage long sys_fork(struct pt_regs *regs)
{
	return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL);
}

asmlinkage long
sys_clone(unsigned long clone_flags, unsigned long newsp,
	  void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
{
	if (!newsp)
		newsp = regs->sp;
	return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
}

/*
 * This is trivial, and on the face of it looks like it
 * could equally well be done in user mode.
 *
 * Not so, for quite unobvious reasons - register pressure.
 * In user mode vfork() cannot have a stack frame, and if
 * done by calling the "clone()" system call directly, you
 * do not have enough call-clobbered registers to hold all
 * the information you need.
 */
asmlinkage long sys_vfork(struct pt_regs *regs)
{
	return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0,
		    NULL, NULL);
}

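/*
 * Report where a sleeping task is blocked: walk its saved frame pointers
 * and return the first return address outside the scheduler, giving up
 * after 16 frames or if the chain leaves the task's stack.
 */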
unsigned long get_wchan(struct task_struct *p)
{
	unsigned long stack;
	u64 fp, ip;
	int count = 0;

	if (!p || p == current || p->state == TASK_RUNNING)
		return 0;
	stack = (unsigned long)task_stack_page(p);
	if (p->thread.sp < stack || p->thread.sp > stack + THREAD_SIZE)
		return 0;
	fp = *(u64 *)(p->thread.sp);
	do {
		if (fp < (unsigned long)stack ||
		    fp > (unsigned long)stack + THREAD_SIZE)
			return 0;
		ip = *(u64 *)(fp + 8);
		if (!in_sched_functions(ip))
			return ip;
		fp = *(u64 *)fp;
	} while (count++ < 16);
	return 0;
}

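/*
 * arch_prctl() backend: get/set the 64-bit FS and GS base registers.  Bases
 * that fit in 32 bits go into a GDT TLS slot (cheaper to reload on context
 * switch); larger ones are written to the FS/GS base MSRs.  Illustrative
 * userspace usage (not part of this file, "tls_block" is just an example
 * pointer):
 *
 *	#include <asm/prctl.h>
 *	#include <sys/syscall.h>
 *
 *	syscall(SYS_arch_prctl, ARCH_SET_FS, (unsigned long)tls_block);
 */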
long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
	int ret = 0;
	int doit = task == current;
	int cpu;

	switch (code) {
	case ARCH_SET_GS:
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, GS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				load_gs_index(GS_TLS_SEL);
			}
			task->thread.gsindex = GS_TLS_SEL;
			task->thread.gs = 0;
		} else {
			task->thread.gsindex = 0;
			task->thread.gs = addr;
			if (doit) {
				load_gs_index(0);
				ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_SET_FS:
		/* Not strictly needed for fs, but do it for symmetry
		   with gs */
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, FS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				loadsegment(fs, FS_TLS_SEL);
			}
			task->thread.fsindex = FS_TLS_SEL;
			task->thread.fs = 0;
		} else {
			task->thread.fsindex = 0;
			task->thread.fs = addr;
			if (doit) {
				/* set the selector to 0 to not confuse
				   __switch_to */
				loadsegment(fs, 0);
				ret = checking_wrmsrl(MSR_FS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_GET_FS: {
		unsigned long base;
		if (task->thread.fsindex == FS_TLS_SEL)
			base = read_32bit_tls(task, FS_TLS);
		else if (doit)
			rdmsrl(MSR_FS_BASE, base);
		else
			base = task->thread.fs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}
	case ARCH_GET_GS: {
		unsigned long base;
		unsigned gsindex;
		if (task->thread.gsindex == GS_TLS_SEL)
			base = read_32bit_tls(task, GS_TLS);
		else if (doit) {
			savesegment(gs, gsindex);
			if (gsindex)
				rdmsrl(MSR_KERNEL_GS_BASE, base);
			else
				base = task->thread.gs;
		} else
			base = task->thread.gs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}

	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}

long sys_arch_prctl(int code, unsigned long addr)
{
	return do_arch_prctl(current, code, addr);
}

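/*
 * Randomize the initial user stack pointer by up to 8KB (unless the task
 * opted out of randomization) and keep it 16-byte aligned.
 */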
unsigned long arch_align_stack(unsigned long sp)
{
	if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
		sp -= get_random_int() % 8192;
	return sp & ~0xf;
}

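/*
 * Place the heap start at a random offset within 32MB above the end of the
 * binary's data segment.
 */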
unsigned long arch_randomize_brk(struct mm_struct *mm)
{
	unsigned long range_end = mm->brk + 0x02000000;
	return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
}