xref: /linux/arch/x86/kernel/process_64.c (revision 40d3057ac036f2501c1930728a6179be4fca577b)
1 /*
2  *  Copyright (C) 1995  Linus Torvalds
3  *
4  *  Pentium III FXSR, SSE support
5  *	Gareth Hughes <gareth@valinux.com>, May 2000
6  *
7  *  X86-64 port
8  *	Andi Kleen.
9  *
10  *	CPU hotplug support - ashok.raj@intel.com
11  */
12 
13 /*
14  * This file handles the architecture-dependent parts of process handling.
15  */
16 
17 #include <stdarg.h>
18 
19 #include <linux/cpu.h>
20 #include <linux/errno.h>
21 #include <linux/sched.h>
22 #include <linux/fs.h>
23 #include <linux/kernel.h>
24 #include <linux/mm.h>
25 #include <linux/elfcore.h>
26 #include <linux/smp.h>
27 #include <linux/slab.h>
28 #include <linux/user.h>
29 #include <linux/interrupt.h>
30 #include <linux/utsname.h>
31 #include <linux/delay.h>
32 #include <linux/module.h>
33 #include <linux/ptrace.h>
34 #include <linux/random.h>
35 #include <linux/notifier.h>
36 #include <linux/kprobes.h>
37 #include <linux/kdebug.h>
38 #include <linux/tick.h>
39 #include <linux/prctl.h>
40 
41 #include <asm/uaccess.h>
42 #include <asm/pgtable.h>
43 #include <asm/system.h>
44 #include <asm/io.h>
45 #include <asm/processor.h>
46 #include <asm/i387.h>
47 #include <asm/mmu_context.h>
48 #include <asm/pda.h>
49 #include <asm/prctl.h>
50 #include <asm/desc.h>
51 #include <asm/proto.h>
52 #include <asm/ia32.h>
53 #include <asm/idle.h>
54 
55 asmlinkage extern void ret_from_fork(void);
56 
57 unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;
58 
59 static ATOMIC_NOTIFIER_HEAD(idle_notifier);
60 
61 void idle_notifier_register(struct notifier_block *n)
62 {
63 	atomic_notifier_chain_register(&idle_notifier, n);
64 }
65 
66 void enter_idle(void)
67 {
68 	write_pda(isidle, 1);
69 	atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
70 }
71 
72 static void __exit_idle(void)
73 {
74 	if (test_and_clear_bit_pda(0, isidle) == 0)
75 		return;
76 	atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
77 }
78 
79 /* Called from interrupts to signify idle end */
80 void exit_idle(void)
81 {
82 	/* idle loop has pid 0 */
83 	if (current->pid)
84 		return;
85 	__exit_idle();
86 }
87 
88 #ifdef CONFIG_HOTPLUG_CPU
89 DECLARE_PER_CPU(int, cpu_state);
90 
91 #include <asm/nmi.h>
92 /* We halt the CPU with physical CPU hotplug */
93 static inline void play_dead(void)
94 {
95 	idle_task_exit();
96 	mb();
97 	/* Ack it */
98 	__get_cpu_var(cpu_state) = CPU_DEAD;
99 
100 	local_irq_disable();
101 	/* mask all interrupts, flush any and all caches, and halt */
102 	wbinvd_halt();
103 }
104 #else
105 static inline void play_dead(void)
106 {
107 	BUG();
108 }
109 #endif /* CONFIG_HOTPLUG_CPU */
110 
111 /*
112  * The idle thread. There's no useful work to be
113  * done, so just try to conserve power and have a
114  * low exit latency (i.e., sit in a loop waiting for
115  * somebody to say that they'd like to reschedule).
116  */
117 void cpu_idle(void)
118 {
119 	current_thread_info()->status |= TS_POLLING;
120 	/* endless idle loop with no priority at all */
121 	while (1) {
122 		tick_nohz_stop_sched_tick(1);
123 		while (!need_resched()) {
124 
125 			rmb();
126 
127 			if (cpu_is_offline(smp_processor_id()))
128 				play_dead();
129 			/*
130 			 * Idle routines should keep interrupts disabled
131 			 * from here on, until they go to idle.
132 			 * Otherwise, idle callbacks can misfire.
133 			 */
134 			local_irq_disable();
135 			enter_idle();
136 			/* Don't trace irqs off for idle */
137 			stop_critical_timings();
138 			pm_idle();
139 			start_critical_timings();
140 			/* In many cases the interrupt that ended idle
141 			   has already called exit_idle. But some idle
142 			   loops can be woken up without an interrupt. */
143 			__exit_idle();
144 		}
145 
146 		tick_nohz_restart_sched_tick();
147 		preempt_enable_no_resched();
148 		schedule();
149 		preempt_disable();
150 	}
151 }
152 
153 /* Also prints some state that isn't saved in pt_regs */
154 void __show_regs(struct pt_regs * regs)
155 {
156 	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
157 	unsigned long d0, d1, d2, d3, d6, d7;
158 	unsigned int fsindex, gsindex;
159 	unsigned int ds, cs, es;
160 
161 	printk("\n");
162 	print_modules();
163 	printk("Pid: %d, comm: %.20s %s %s %.*s\n",
164 		current->pid, current->comm, print_tainted(),
165 		init_utsname()->release,
166 		(int)strcspn(init_utsname()->version, " "),
167 		init_utsname()->version);
168 	printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
169 	printk_address(regs->ip, 1);
170 	printk("RSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss, regs->sp,
171 		regs->flags);
172 	printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
173 	       regs->ax, regs->bx, regs->cx);
174 	printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
175 	       regs->dx, regs->si, regs->di);
176 	printk("RBP: %016lx R08: %016lx R09: %016lx\n",
177 	       regs->bp, regs->r8, regs->r9);
178 	printk("R10: %016lx R11: %016lx R12: %016lx\n",
179 	       regs->r10, regs->r11, regs->r12);
180 	printk("R13: %016lx R14: %016lx R15: %016lx\n",
181 	       regs->r13, regs->r14, regs->r15);
182 
183 	asm("movl %%ds,%0" : "=r" (ds));
184 	asm("movl %%cs,%0" : "=r" (cs));
185 	asm("movl %%es,%0" : "=r" (es));
186 	asm("movl %%fs,%0" : "=r" (fsindex));
187 	asm("movl %%gs,%0" : "=r" (gsindex));
188 
189 	rdmsrl(MSR_FS_BASE, fs);
190 	rdmsrl(MSR_GS_BASE, gs);
191 	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
192 
193 	cr0 = read_cr0();
194 	cr2 = read_cr2();
195 	cr3 = read_cr3();
196 	cr4 = read_cr4();
197 
198 	printk("FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
199 	       fs, fsindex, gs, gsindex, shadowgs);
200 	printk("CS:  %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0);
201 	printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4);
202 
203 	get_debugreg(d0, 0);
204 	get_debugreg(d1, 1);
205 	get_debugreg(d2, 2);
206 	printk("DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
207 	get_debugreg(d3, 3);
208 	get_debugreg(d6, 6);
209 	get_debugreg(d7, 7);
210 	printk("DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
211 }
212 
213 void show_regs(struct pt_regs *regs)
214 {
215 	printk("CPU %d:", smp_processor_id());
216 	__show_regs(regs);
217 	show_trace(NULL, regs, (void *)(regs + 1), regs->bp);
218 }
219 
220 /*
221  * Free current thread data structures, etc.
222  */
223 void exit_thread(void)
224 {
225 	struct task_struct *me = current;
226 	struct thread_struct *t = &me->thread;
227 
228 	if (me->thread.io_bitmap_ptr) {
229 		struct tss_struct *tss = &per_cpu(init_tss, get_cpu());
230 
231 		kfree(t->io_bitmap_ptr);
232 		t->io_bitmap_ptr = NULL;
233 		clear_thread_flag(TIF_IO_BITMAP);
234 		/*
235 		 * Careful, clear this in the TSS too:
236 		 */
237 		memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
238 		t->io_bitmap_max = 0;
239 		put_cpu();
240 	}
241 }
242 
243 void flush_thread(void)
244 {
245 	struct task_struct *tsk = current;
246 
247 	if (test_tsk_thread_flag(tsk, TIF_ABI_PENDING)) {
248 		clear_tsk_thread_flag(tsk, TIF_ABI_PENDING);
249 		if (test_tsk_thread_flag(tsk, TIF_IA32)) {
250 			clear_tsk_thread_flag(tsk, TIF_IA32);
251 		} else {
252 			set_tsk_thread_flag(tsk, TIF_IA32);
253 			current_thread_info()->status |= TS_COMPAT;
254 		}
255 	}
256 	clear_tsk_thread_flag(tsk, TIF_DEBUG);
257 
258 	tsk->thread.debugreg0 = 0;
259 	tsk->thread.debugreg1 = 0;
260 	tsk->thread.debugreg2 = 0;
261 	tsk->thread.debugreg3 = 0;
262 	tsk->thread.debugreg6 = 0;
263 	tsk->thread.debugreg7 = 0;
264 	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
265 	/*
266 	 * Forget the coprocessor state.
267 	 */
268 	tsk->fpu_counter = 0;
269 	clear_fpu(tsk);
270 	clear_used_math();
271 }
272 
273 void release_thread(struct task_struct *dead_task)
274 {
275 	if (dead_task->mm) {
276 		if (dead_task->mm->context.size) {
277 			printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
278 					dead_task->comm,
279 					dead_task->mm->context.ldt,
280 					dead_task->mm->context.size);
281 			BUG();
282 		}
283 	}
284 }
285 
286 static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
287 {
288 	struct user_desc ud = {
289 		.base_addr = addr,
290 		.limit = 0xfffff,
291 		.seg_32bit = 1,
292 		.limit_in_pages = 1,
293 		.useable = 1,
294 	};
295 	struct desc_struct *desc = t->thread.tls_array;
296 	desc += tls;
297 	fill_ldt(desc, &ud);
298 }
299 
300 static inline u32 read_32bit_tls(struct task_struct *t, int tls)
301 {
302 	return get_desc_base(&t->thread.tls_array[tls]);
303 }
304 
305 /*
306  * This gets called before we allocate a new thread and copy
307  * the current task into it.
308  */
309 void prepare_to_copy(struct task_struct *tsk)
310 {
311 	unlazy_fpu(tsk);
312 }
313 
314 int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
315 		unsigned long unused,
316 	struct task_struct * p, struct pt_regs * regs)
317 {
318 	int err;
319 	struct pt_regs * childregs;
320 	struct task_struct *me = current;
321 
322 	childregs = ((struct pt_regs *)
323 			(THREAD_SIZE + task_stack_page(p))) - 1;
324 	*childregs = *regs;
325 
326 	childregs->ax = 0;
327 	childregs->sp = sp;
328 	if (sp == ~0UL)
329 		childregs->sp = (unsigned long)childregs;
330 
331 	p->thread.sp = (unsigned long) childregs;
332 	p->thread.sp0 = (unsigned long) (childregs+1);
333 	p->thread.usersp = me->thread.usersp;
334 
335 	set_tsk_thread_flag(p, TIF_FORK);
336 
337 	p->thread.fs = me->thread.fs;
338 	p->thread.gs = me->thread.gs;
339 
340 	savesegment(gs, p->thread.gsindex);
341 	savesegment(fs, p->thread.fsindex);
342 	savesegment(es, p->thread.es);
343 	savesegment(ds, p->thread.ds);
344 
345 	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
346 		p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
347 		if (!p->thread.io_bitmap_ptr) {
348 			p->thread.io_bitmap_max = 0;
349 			return -ENOMEM;
350 		}
351 		memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
352 				IO_BITMAP_BYTES);
353 		set_tsk_thread_flag(p, TIF_IO_BITMAP);
354 	}
355 
356 	/*
357 	 * Set a new TLS for the child thread?
358 	 */
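	/*
	 * The tls value is taken straight from the child's saved registers:
	 * in the 64-bit clone ABI it is the fifth syscall argument and thus
	 * arrives in %r8 (childregs->r8); under IA32 emulation the
	 * struct user_desc pointer arrives in %esi (childregs->si).
	 */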
359 	if (clone_flags & CLONE_SETTLS) {
360 #ifdef CONFIG_IA32_EMULATION
361 		if (test_thread_flag(TIF_IA32))
362 			err = do_set_thread_area(p, -1,
363 				(struct user_desc __user *)childregs->si, 0);
364 		else
365 #endif
366 			err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
367 		if (err)
368 			goto out;
369 	}
370 	err = 0;
371 out:
372 	if (err && p->thread.io_bitmap_ptr) {
373 		kfree(p->thread.io_bitmap_ptr);
374 		p->thread.io_bitmap_max = 0;
375 	}
376 	return err;
377 }
378 
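/*
 * start_thread() is called by the binfmt loaders (e.g. load_elf_binary())
 * at the end of execve(): it resets the user segment registers and points
 * the saved pt_regs at the new program's entry point and initial stack.
 */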
379 void
380 start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
381 {
382 	loadsegment(fs, 0);
383 	loadsegment(es, 0);
384 	loadsegment(ds, 0);
385 	load_gs_index(0);
386 	regs->ip		= new_ip;
387 	regs->sp		= new_sp;
388 	write_pda(oldrsp, new_sp);
389 	regs->cs		= __USER_CS;
390 	regs->ss		= __USER_DS;
391 	regs->flags		= 0x200;	/* X86_EFLAGS_IF: start with interrupts enabled */
392 	set_fs(USER_DS);
393 	/*
394 	 * Free the old FP and other extended state
395 	 */
396 	free_thread_xstate(current);
397 }
398 EXPORT_SYMBOL_GPL(start_thread);
399 
400 static void hard_disable_TSC(void)
401 {
402 	write_cr4(read_cr4() | X86_CR4_TSD);
403 }
404 
405 void disable_TSC(void)
406 {
407 	preempt_disable();
408 	if (!test_and_set_thread_flag(TIF_NOTSC))
409 		/*
410 		 * Must flip the CPU state synchronously with
411 		 * TIF_NOTSC in the current running context.
412 		 */
413 		hard_disable_TSC();
414 	preempt_enable();
415 }
416 
417 static void hard_enable_TSC(void)
418 {
419 	write_cr4(read_cr4() & ~X86_CR4_TSD);
420 }
421 
422 static void enable_TSC(void)
423 {
424 	preempt_disable();
425 	if (test_and_clear_thread_flag(TIF_NOTSC))
426 		/*
427 		 * Must flip the CPU state synchronously with
428 		 * TIF_NOTSC in the current running context.
429 		 */
430 		hard_enable_TSC();
431 	preempt_enable();
432 }
433 
434 int get_tsc_mode(unsigned long adr)
435 {
436 	unsigned int val;
437 
438 	if (test_thread_flag(TIF_NOTSC))
439 		val = PR_TSC_SIGSEGV;
440 	else
441 		val = PR_TSC_ENABLE;
442 
443 	return put_user(val, (unsigned int __user *)adr);
444 }
445 
446 int set_tsc_mode(unsigned int val)
447 {
448 	if (val == PR_TSC_SIGSEGV)
449 		disable_TSC();
450 	else if (val == PR_TSC_ENABLE)
451 		enable_TSC();
452 	else
453 		return -EINVAL;
454 
455 	return 0;
456 }
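
/*
 * get_tsc_mode()/set_tsc_mode() are reached from the generic prctl()
 * path (PR_GET_TSC/PR_SET_TSC).  A minimal userspace sketch, assuming
 * <linux/prctl.h> provides the PR_* constants:
 *
 *	#include <sys/prctl.h>
 *	#include <linux/prctl.h>
 *
 *	prctl(PR_SET_TSC, PR_TSC_SIGSEGV, 0, 0, 0);
 *	// from here on, RDTSC in this task raises SIGSEGV (CR4.TSD is set)
 *
 *	int mode;
 *	prctl(PR_GET_TSC, (unsigned long)&mode, 0, 0, 0);  // mode == PR_TSC_SIGSEGV
 */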
457 
458 /*
459  * This special macro can be used to load a debugging register
460  */
461 #define loaddebug(thread, r) set_debugreg(thread->debugreg ## r, r)
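
/*
 * Example expansion: loaddebug(next, 0) becomes, via ## token pasting,
 * set_debugreg(next->debugreg0, 0), i.e. it loads the value saved in
 * next->debugreg0 back into hardware debug register 0.
 */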
462 
463 static inline void __switch_to_xtra(struct task_struct *prev_p,
464 				    struct task_struct *next_p,
465 				    struct tss_struct *tss)
466 {
467 	struct thread_struct *prev, *next;
468 	unsigned long debugctl;
469 
470 	prev = &prev_p->thread;
471 	next = &next_p->thread;
472 
473 	debugctl = prev->debugctlmsr;
474 	if (next->ds_area_msr != prev->ds_area_msr) {
475 		/* we clear debugctl to make sure DS
476 		 * is not in use when we change it */
477 		debugctl = 0;
478 		update_debugctlmsr(0);
479 		wrmsrl(MSR_IA32_DS_AREA, next->ds_area_msr);
480 	}
481 
482 	if (next->debugctlmsr != debugctl)
483 		update_debugctlmsr(next->debugctlmsr);
484 
485 	if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
486 		loaddebug(next, 0);
487 		loaddebug(next, 1);
488 		loaddebug(next, 2);
489 		loaddebug(next, 3);
490 		/* no 4 and 5 */
491 		loaddebug(next, 6);
492 		loaddebug(next, 7);
493 	}
494 
495 	if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
496 	    test_tsk_thread_flag(next_p, TIF_NOTSC)) {
497 		/* prev and next are different */
498 		if (test_tsk_thread_flag(next_p, TIF_NOTSC))
499 			hard_disable_TSC();
500 		else
501 			hard_enable_TSC();
502 	}
503 
504 	if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
505 		/*
506 		 * Copy the relevant range of the IO bitmap.
507 		 * Normally this is 128 bytes or less:
508 		 */
509 		memcpy(tss->io_bitmap, next->io_bitmap_ptr,
510 		       max(prev->io_bitmap_max, next->io_bitmap_max));
511 	} else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
512 		/*
513 		 * Clear any possible leftover bits:
514 		 */
515 		memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
516 	}
517 
518 #ifdef X86_BTS
519 	if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
520 		ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);
521 
522 	if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
523 		ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
524 #endif
525 }
526 
527 /*
528  *	switch_to(x,y) should switch tasks from x to y.
529  *
530  * This could still be optimized:
531  * - fold all the options into a flag word and test it with a single test.
532  * - could test fs/gs bitsliced
533  *
534  * Kprobes not supported here. Set the probe on schedule instead.
535  */
536 struct task_struct *
537 __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
538 {
539 	struct thread_struct *prev = &prev_p->thread;
540 	struct thread_struct *next = &next_p->thread;
541 	int cpu = smp_processor_id();
542 	struct tss_struct *tss = &per_cpu(init_tss, cpu);
543 	unsigned fsindex, gsindex;
544 
545 	/* we're going to use this soon, after a few expensive things */
546 	if (next_p->fpu_counter > 5)
547 		prefetch(next->xstate);
548 
549 	/*
550 	 * Reload esp0, LDT and the page table pointer:
551 	 */
552 	load_sp0(tss, next);
553 
554 	/*
555 	 * Switch DS and ES.
556 	 * This won't pick up thread selector changes, but I guess that is ok.
557 	 */
558 	savesegment(es, prev->es);
559 	if (unlikely(next->es | prev->es))
560 		loadsegment(es, next->es);
561 
562 	savesegment(ds, prev->ds);
563 	if (unlikely(next->ds | prev->ds))
564 		loadsegment(ds, next->ds);
565 
566 
567 	/* We must save %fs and %gs before load_TLS() because
568 	 * %fs and %gs may be cleared by load_TLS().
569 	 *
570 	 * (e.g. xen_load_tls())
571 	 */
572 	savesegment(fs, fsindex);
573 	savesegment(gs, gsindex);
574 
575 	load_TLS(next, cpu);
576 
577 	/*
578 	 * Leave lazy mode, flushing any hypercalls made here.
579 	 * This must be done before restoring TLS segments so
580 	 * the GDT and LDT are properly updated, and must be
581 	 * done before math_state_restore, so the TS bit is up
582 	 * to date.
583 	 */
584 	arch_leave_lazy_cpu_mode();
585 
586 	/*
587 	 * Switch FS and GS.
588 	 *
589 	 * Segment register != 0 always requires a reload.  Also
590 	 * A segment register value != 0 always requires a reload.  Also
591 	 * reload when it has changed.  When the previous process used a
592 	 * 64-bit base, always reload to avoid an information leak.
593 	if (unlikely(fsindex | next->fsindex | prev->fs)) {
594 		loadsegment(fs, next->fsindex);
595 		/*
596 		 * Check if the user used a selector != 0; if so,
597 		 * clear the 64-bit base, since an overloaded base is
598 		 * always mapped to the null selector.
599 		 */
600 		if (fsindex)
601 			prev->fs = 0;
602 	}
603 	/* When the next process has a 64-bit base, use it */
604 	if (next->fs)
605 		wrmsrl(MSR_FS_BASE, next->fs);
606 	prev->fsindex = fsindex;
607 
608 	if (unlikely(gsindex | next->gsindex | prev->gs)) {
609 		load_gs_index(next->gsindex);
610 		if (gsindex)
611 			prev->gs = 0;
612 	}
613 	if (next->gs)
614 		wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
615 	prev->gsindex = gsindex;
616 
617 	/* Must be after DS reload */
618 	unlazy_fpu(prev_p);
619 
620 	/*
621 	 * Switch the PDA and FPU contexts.
622 	 */
623 	prev->usersp = read_pda(oldrsp);
624 	write_pda(oldrsp, next->usersp);
625 	write_pda(pcurrent, next_p);
626 
627 	write_pda(kernelstack,
628 		  (unsigned long)task_stack_page(next_p) +
629 		  THREAD_SIZE - PDA_STACKOFFSET);
630 #ifdef CONFIG_CC_STACKPROTECTOR
631 	write_pda(stack_canary, next_p->stack_canary);
632 	/*
633 	 * Build time only check to make sure the stack_canary is at
634 	 * offset 40 in the pda; this is a gcc ABI requirement
635 	 */
636 	BUILD_BUG_ON(offsetof(struct x8664_pda, stack_canary) != 40);
637 #endif
638 
639 	/*
640 	 * Now maybe reload the debug registers and handle I/O bitmaps
641 	 */
642 	if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
643 		     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
644 		__switch_to_xtra(prev_p, next_p, tss);
645 
646 	/* If the task has used the FPU in the last 5 timeslices, just do a
647 	 * full restore of the math state immediately to avoid the trap; the
648 	 * chances of needing the FPU soon are obviously high now.
649 	 *
650 	 * tsk_used_math() checks prevent calling math_state_restore(),
651 	 * which can sleep in the case of !tsk_used_math()
652 	 */
653 	if (tsk_used_math(next_p) && next_p->fpu_counter > 5)
654 		math_state_restore();
655 	return prev_p;
656 }
657 
658 /*
659  * sys_execve() executes a new program.
660  */
661 asmlinkage
662 long sys_execve(char __user *name, char __user * __user *argv,
663 		char __user * __user *envp, struct pt_regs *regs)
664 {
665 	long error;
666 	char * filename;
667 
668 	filename = getname(name);
669 	error = PTR_ERR(filename);
670 	if (IS_ERR(filename))
671 		return error;
672 	error = do_execve(filename, argv, envp, regs);
673 	putname(filename);
674 	return error;
675 }
676 
677 void set_personality_64bit(void)
678 {
679 	/* inherit personality from parent */
680 
681 	/* Make sure to be in 64bit mode */
682 	clear_thread_flag(TIF_IA32);
683 
684 	/* TBD: overwrites user setup. Should have two bits.
685 	   But 64-bit processes have always behaved this way,
686 	   so it's not too bad. The main problem is just that
687 	   32-bit children are affected again. */
688 	current->personality &= ~READ_IMPLIES_EXEC;
689 }
690 
691 asmlinkage long sys_fork(struct pt_regs *regs)
692 {
693 	return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL);
694 }
695 
696 asmlinkage long
697 sys_clone(unsigned long clone_flags, unsigned long newsp,
698 	  void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
699 {
700 	if (!newsp)
701 		newsp = regs->sp;
702 	return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
703 }
704 
705 /*
706  * This is trivial, and on the face of it looks like it
707  * could equally well be done in user mode.
708  *
709  * Not so, for quite unobvious reasons - register pressure.
710  * In user mode vfork() cannot have a stack frame, and if
711  * done by calling the "clone()" system call directly, you
712  * do not have enough call-clobbered registers to hold all
713  * the information you need.
714  */
715 asmlinkage long sys_vfork(struct pt_regs *regs)
716 {
717 	return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0,
718 		    NULL, NULL);
719 }
720 
721 unsigned long get_wchan(struct task_struct *p)
722 {
723 	unsigned long stack;
724 	u64 fp, ip;
725 	int count = 0;
726 
727 	if (!p || p == current || p->state == TASK_RUNNING)
728 		return 0;
729 	stack = (unsigned long)task_stack_page(p);
730 	if (p->thread.sp < stack || p->thread.sp > stack+THREAD_SIZE)
731 		return 0;
732 	fp = *(u64 *)(p->thread.sp);
733 	do {
734 		if (fp < (unsigned long)stack ||
735 		    fp > (unsigned long)stack+THREAD_SIZE)
736 			return 0;
737 		ip = *(u64 *)(fp+8);
738 		if (!in_sched_functions(ip))
739 			return ip;
740 		fp = *(u64 *)fp;
741 	} while (count++ < 16);
742 	return 0;
743 }
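
/*
 * get_wchan() above backs /proc/<pid>/wchan (the "wchan" field shown by
 * ps): for a sleeping task it walks the saved frame pointers on the
 * kernel stack (*(u64 *)sp is the saved %rbp, fp + 8 the return address)
 * until it finds a return address outside the scheduler, giving the
 * function the task is blocked in.
 */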
744 
745 long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
746 {
747 	int ret = 0;
748 	int doit = task == current;
749 	int cpu;
750 
751 	switch (code) {
752 	case ARCH_SET_GS:
753 		if (addr >= TASK_SIZE_OF(task))
754 			return -EPERM;
755 		cpu = get_cpu();
756 		/* handle small bases via the GDT because that's faster to
757 		   switch. */
758 		if (addr <= 0xffffffff) {
759 			set_32bit_tls(task, GS_TLS, addr);
760 			if (doit) {
761 				load_TLS(&task->thread, cpu);
762 				load_gs_index(GS_TLS_SEL);
763 			}
764 			task->thread.gsindex = GS_TLS_SEL;
765 			task->thread.gs = 0;
766 		} else {
767 			task->thread.gsindex = 0;
768 			task->thread.gs = addr;
769 			if (doit) {
770 				load_gs_index(0);
771 				ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
772 			}
773 		}
774 		put_cpu();
775 		break;
776 	case ARCH_SET_FS:
777 		/* Not strictly needed for fs, but do it for symmetry
778 		   with gs */
779 		if (addr >= TASK_SIZE_OF(task))
780 			return -EPERM;
781 		cpu = get_cpu();
782 		/* handle small bases via the GDT because that's faster to
783 		   switch. */
784 		if (addr <= 0xffffffff) {
785 			set_32bit_tls(task, FS_TLS, addr);
786 			if (doit) {
787 				load_TLS(&task->thread, cpu);
788 				loadsegment(fs, FS_TLS_SEL);
789 			}
790 			task->thread.fsindex = FS_TLS_SEL;
791 			task->thread.fs = 0;
792 		} else {
793 			task->thread.fsindex = 0;
794 			task->thread.fs = addr;
795 			if (doit) {
796 				/* set the selector to 0 to not confuse
797 				   __switch_to */
798 				loadsegment(fs, 0);
799 				ret = checking_wrmsrl(MSR_FS_BASE, addr);
800 			}
801 		}
802 		put_cpu();
803 		break;
804 	case ARCH_GET_FS: {
805 		unsigned long base;
806 		if (task->thread.fsindex == FS_TLS_SEL)
807 			base = read_32bit_tls(task, FS_TLS);
808 		else if (doit)
809 			rdmsrl(MSR_FS_BASE, base);
810 		else
811 			base = task->thread.fs;
812 		ret = put_user(base, (unsigned long __user *)addr);
813 		break;
814 	}
815 	case ARCH_GET_GS: {
816 		unsigned long base;
817 		unsigned gsindex;
818 		if (task->thread.gsindex == GS_TLS_SEL)
819 			base = read_32bit_tls(task, GS_TLS);
820 		else if (doit) {
821 			savesegment(gs, gsindex);
822 			if (gsindex)
823 				rdmsrl(MSR_KERNEL_GS_BASE, base);
824 			else
825 				base = task->thread.gs;
826 		}
827 		else
828 			base = task->thread.gs;
829 		ret = put_user(base, (unsigned long __user *)addr);
830 		break;
831 	}
832 
833 	default:
834 		ret = -EINVAL;
835 		break;
836 	}
837 
838 	return ret;
839 }
840 
841 long sys_arch_prctl(int code, unsigned long addr)
842 {
843 	return do_arch_prctl(current, code, addr);
844 }
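
/*
 * Userspace reaches do_arch_prctl() through the arch_prctl(2) syscall.
 * A minimal sketch using the raw syscall (glibc provides no wrapper);
 * the ARCH_* codes come from <asm/prctl.h>:
 *
 *	#include <asm/prctl.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	struct tls_block { long self; } tcb;	// hypothetical TLS block
 *
 *	syscall(SYS_arch_prctl, ARCH_SET_FS, &tcb);	// %fs base = &tcb
 *
 *	unsigned long base;
 *	syscall(SYS_arch_prctl, ARCH_GET_FS, &base);	// base == (unsigned long)&tcb
 */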
845 
846 unsigned long arch_align_stack(unsigned long sp)
847 {
848 	if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
849 		sp -= get_random_int() % 8192;
850 	return sp & ~0xf;
851 }
852 
853 unsigned long arch_randomize_brk(struct mm_struct *mm)
854 {
855 	unsigned long range_end = mm->brk + 0x02000000;
856 	return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
857 }
858 }