/*
 * linux/arch/x86-64/kernel/process.c
 *
 * Copyright (C) 1995 Linus Torvalds
 *
 * Pentium III FXSR, SSE support
 *	Gareth Hughes <gareth@valinux.com>, May 2000
 *
 * X86-64 port
 *	Andi Kleen.
 *
 *	CPU hotplug support - ashok.raj@intel.com
 */

/*
 * This file handles the architecture-dependent parts of process handling..
 */

#include <stdarg.h>

#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/module.h>
#include <linux/a.out.h>
#include <linux/interrupt.h>
#include <linux/delay.h>
#include <linux/ptrace.h>
#include <linux/utsname.h>
#include <linux/random.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/tick.h>

#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/system.h>
#include <asm/io.h>
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/mmu_context.h>
#include <asm/pda.h>
#include <asm/prctl.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/ia32.h>
#include <asm/idle.h>

asmlinkage extern void ret_from_fork(void);

unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;

unsigned long boot_option_idle_override = 0;
EXPORT_SYMBOL(boot_option_idle_override);

/*
 * Powermanagement idle function, if any..
 */
void (*pm_idle)(void);
EXPORT_SYMBOL(pm_idle);
static DEFINE_PER_CPU(unsigned int, cpu_idle_state);

static ATOMIC_NOTIFIER_HEAD(idle_notifier);

void idle_notifier_register(struct notifier_block *n)
{
	atomic_notifier_chain_register(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_register);

void idle_notifier_unregister(struct notifier_block *n)
{
	atomic_notifier_chain_unregister(&idle_notifier, n);
}
EXPORT_SYMBOL(idle_notifier_unregister);

void enter_idle(void)
{
	write_pda(isidle, 1);
	atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}

static void __exit_idle(void)
{
	if (test_and_clear_bit_pda(0, isidle) == 0)
		return;
	atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
}

/* Called from interrupts to signify idle end */
void exit_idle(void)
{
	/* idle loop has pid 0 */
	if (current->pid)
		return;
	__exit_idle();
}

/*
 * We use this if we don't have any better
 * idle routine..
 */
static void default_idle(void)
{
	current_thread_info()->status &= ~TS_POLLING;
	/*
	 * TS_POLLING-cleared state must be visible before we
	 * test NEED_RESCHED:
	 */
	smp_mb();
	local_irq_disable();
	if (!need_resched()) {
		/* Enables interrupts one instruction before HLT.
		   x86 special cases this so there is no race. */
		safe_halt();
	} else
		local_irq_enable();
	current_thread_info()->status |= TS_POLLING;
}

/*
 * On SMP it's slightly faster (but much more power-consuming!)
 * to poll the ->need_resched flag instead of waiting for the
 * cross-CPU IPI to arrive. Use this option with caution.
 */
static void poll_idle (void)
{
	local_irq_enable();
	cpu_relax();
}

void cpu_idle_wait(void)
{
	unsigned int cpu, this_cpu = get_cpu();
	cpumask_t map, tmp = current->cpus_allowed;

	set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
	put_cpu();

	cpus_clear(map);
	for_each_online_cpu(cpu) {
		per_cpu(cpu_idle_state, cpu) = 1;
		cpu_set(cpu, map);
	}

	__get_cpu_var(cpu_idle_state) = 0;

	wmb();
	do {
		ssleep(1);
		for_each_online_cpu(cpu) {
			if (cpu_isset(cpu, map) &&
					!per_cpu(cpu_idle_state, cpu))
				cpu_clear(cpu, map);
		}
		cpus_and(map, map, cpu_online_map);
	} while (!cpus_empty(map));

	set_cpus_allowed(current, tmp);
}
EXPORT_SYMBOL_GPL(cpu_idle_wait);

#ifdef CONFIG_HOTPLUG_CPU
DECLARE_PER_CPU(int, cpu_state);

#include <asm/nmi.h>
/* We halt the CPU with physical CPU hotplug */
static inline void play_dead(void)
{
	idle_task_exit();
	wbinvd();
	mb();
	/* Ack it */
	__get_cpu_var(cpu_state) = CPU_DEAD;

	local_irq_disable();
	while (1)
		halt();
}
#else
static inline void play_dead(void)
{
	BUG();
}
#endif /* CONFIG_HOTPLUG_CPU */

/*
 * The idle thread. There's no useful work to be
 * done, so just try to conserve power and have a
 * low exit latency (ie sit in a loop waiting for
 * somebody to say that they'd like to reschedule)
 */
void cpu_idle (void)
{
	current_thread_info()->status |= TS_POLLING;
	/* endless idle loop with no priority at all */
	while (1) {
		while (!need_resched()) {
			void (*idle)(void);

			if (__get_cpu_var(cpu_idle_state))
				__get_cpu_var(cpu_idle_state) = 0;

			tick_nohz_stop_sched_tick();

			rmb();
			idle = pm_idle;
			if (!idle)
				idle = default_idle;
			if (cpu_is_offline(smp_processor_id()))
				play_dead();
			/*
			 * Idle routines should keep interrupts disabled
			 * from here on, until they go to idle.
			 * Otherwise, idle callbacks can misfire.
			 */
			local_irq_disable();
			enter_idle();
			idle();
			/* In many cases the interrupt that ended idle
			   has already called exit_idle. But some idle
			   loops can be woken up without interrupt. */
			__exit_idle();
		}

		tick_nohz_restart_sched_tick();
		preempt_enable_no_resched();
		schedule();
		preempt_disable();
	}
}

/*
 * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
 * which can obviate IPI to trigger checking of need_resched.
 * We execute MONITOR against need_resched and enter optimized wait state
 * through MWAIT. Whenever someone changes need_resched, we would be woken
 * up from MWAIT (without an IPI).
 *
 * New with Core Duo processors, MWAIT can take some hints based on CPU
 * capability.
 */
void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
{
	if (!need_resched()) {
		__monitor((void *)&current_thread_info()->flags, 0, 0);
		smp_mb();
		if (!need_resched())
			__mwait(eax, ecx);
	}
}

/* Default MONITOR/MWAIT with no hints, used for default C1 state */
static void mwait_idle(void)
{
	if (!need_resched()) {
		__monitor((void *)&current_thread_info()->flags, 0, 0);
		smp_mb();
		if (!need_resched())
			__sti_mwait(0, 0);
		else
			local_irq_enable();
	} else {
		local_irq_enable();
	}
}

void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
{
	static int printed;
	if (cpu_has(c, X86_FEATURE_MWAIT)) {
		/*
		 * Skip, if setup has overridden idle.
		 * One CPU supports mwait => All CPUs support mwait
		 */
		if (!pm_idle) {
			if (!printed) {
				printk(KERN_INFO "using mwait in idle threads.\n");
				printed = 1;
			}
			pm_idle = mwait_idle;
		}
	}
}

static int __init idle_setup (char *str)
{
	if (!strcmp(str, "poll")) {
		printk("using polling idle threads.\n");
		pm_idle = poll_idle;
	} else if (!strcmp(str, "mwait"))
		force_mwait = 1;
	else
		return -1;

	boot_option_idle_override = 1;
	return 0;
}
early_param("idle", idle_setup);

/* Prints also some state that isn't saved in the pt_regs */
void __show_regs(struct pt_regs * regs)
{
	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
	unsigned long d0, d1, d2, d3, d6, d7;
	unsigned int fsindex, gsindex;
	unsigned int ds, cs, es;

	printk("\n");
	print_modules();
	printk("Pid: %d, comm: %.20s %s %s %.*s\n",
		current->pid, current->comm, print_tainted(),
		init_utsname()->release,
		(int)strcspn(init_utsname()->version, " "),
		init_utsname()->version);
	printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip);
	printk_address(regs->rip);
	printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp,
		regs->eflags);
	printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
	       regs->rax, regs->rbx, regs->rcx);
	printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
	       regs->rdx, regs->rsi, regs->rdi);
	printk("RBP: %016lx R08: %016lx R09: %016lx\n",
	       regs->rbp, regs->r8, regs->r9);
	printk("R10: %016lx R11: %016lx R12: %016lx\n",
	       regs->r10, regs->r11, regs->r12);
	printk("R13: %016lx R14: %016lx R15: %016lx\n",
	       regs->r13, regs->r14, regs->r15);

	asm("movl %%ds,%0" : "=r" (ds));
	asm("movl %%cs,%0" : "=r" (cs));
	asm("movl %%es,%0" : "=r" (es));
	asm("movl %%fs,%0" : "=r" (fsindex));
	asm("movl %%gs,%0" : "=r" (gsindex));

	rdmsrl(MSR_FS_BASE, fs);
	rdmsrl(MSR_GS_BASE, gs);
	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

	cr0 = read_cr0();
	cr2 = read_cr2();
	cr3 = read_cr3();
	cr4 = read_cr4();

	printk("FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
	       fs, fsindex, gs, gsindex, shadowgs);
	printk("CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0);
	printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4);

	get_debugreg(d0, 0);
	get_debugreg(d1, 1);
	get_debugreg(d2, 2);
	printk("DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
	get_debugreg(d3, 3);
	get_debugreg(d6, 6);
	get_debugreg(d7, 7);
	printk("DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
}

void show_regs(struct pt_regs *regs)
{
	printk("CPU %d:", smp_processor_id());
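	/* Dump the extended register state, then the stack trace that
	   starts just above the saved pt_regs. */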
	__show_regs(regs);
	show_trace(NULL, regs, (void *)(regs + 1));
}

/*
 * Free current thread data structures etc..
 */
void exit_thread(void)
{
	struct task_struct *me = current;
	struct thread_struct *t = &me->thread;

	if (me->thread.io_bitmap_ptr) {
		struct tss_struct *tss = &per_cpu(init_tss, get_cpu());

		kfree(t->io_bitmap_ptr);
		t->io_bitmap_ptr = NULL;
		clear_thread_flag(TIF_IO_BITMAP);
		/*
		 * Careful, clear this in the TSS too:
		 */
		memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
		t->io_bitmap_max = 0;
		put_cpu();
	}
}

void flush_thread(void)
{
	struct task_struct *tsk = current;

	if (test_tsk_thread_flag(tsk, TIF_ABI_PENDING)) {
		clear_tsk_thread_flag(tsk, TIF_ABI_PENDING);
		if (test_tsk_thread_flag(tsk, TIF_IA32)) {
			clear_tsk_thread_flag(tsk, TIF_IA32);
		} else {
			set_tsk_thread_flag(tsk, TIF_IA32);
			current_thread_info()->status |= TS_COMPAT;
		}
	}
	clear_tsk_thread_flag(tsk, TIF_DEBUG);

	tsk->thread.debugreg0 = 0;
	tsk->thread.debugreg1 = 0;
	tsk->thread.debugreg2 = 0;
	tsk->thread.debugreg3 = 0;
	tsk->thread.debugreg6 = 0;
	tsk->thread.debugreg7 = 0;
	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
	/*
	 * Forget coprocessor state..
	 */
	clear_fpu(tsk);
	clear_used_math();
}

void release_thread(struct task_struct *dead_task)
{
	if (dead_task->mm) {
		if (dead_task->mm->context.size) {
			printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
					dead_task->comm,
					dead_task->mm->context.ldt,
					dead_task->mm->context.size);
			BUG();
		}
	}
}

static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
{
	struct user_desc ud = {
		.base_addr = addr,
		.limit = 0xfffff,
		.seg_32bit = 1,
		.limit_in_pages = 1,
		.useable = 1,
	};
	struct n_desc_struct *desc = (void *)t->thread.tls_array;
	desc += tls;
	desc->a = LDT_entry_a(&ud);
	desc->b = LDT_entry_b(&ud);
}

static inline u32 read_32bit_tls(struct task_struct *t, int tls)
{
	struct desc_struct *desc = (void *)t->thread.tls_array;
	desc += tls;
	return desc->base0 |
		(((u32)desc->base1) << 16) |
		(((u32)desc->base2) << 24);
}

/*
 * This gets called before we allocate a new thread and copy
 * the current task into it.
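 * Saving the lazy FPU state into the task struct here keeps the
 * child's FPU copy consistent.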
 */
void prepare_to_copy(struct task_struct *tsk)
{
	unlazy_fpu(tsk);
}

int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp,
		unsigned long unused,
	struct task_struct * p, struct pt_regs * regs)
{
	int err;
	struct pt_regs * childregs;
	struct task_struct *me = current;

	childregs = ((struct pt_regs *)
			(THREAD_SIZE + task_stack_page(p))) - 1;
	*childregs = *regs;

	childregs->rax = 0;
	childregs->rsp = rsp;
	if (rsp == ~0UL)
		childregs->rsp = (unsigned long)childregs;

	p->thread.rsp = (unsigned long) childregs;
	p->thread.rsp0 = (unsigned long) (childregs+1);
	p->thread.userrsp = me->thread.userrsp;

	set_tsk_thread_flag(p, TIF_FORK);

	p->thread.fs = me->thread.fs;
	p->thread.gs = me->thread.gs;

	asm("mov %%gs,%0" : "=m" (p->thread.gsindex));
	asm("mov %%fs,%0" : "=m" (p->thread.fsindex));
	asm("mov %%es,%0" : "=m" (p->thread.es));
	asm("mov %%ds,%0" : "=m" (p->thread.ds));

	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
		p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
		if (!p->thread.io_bitmap_ptr) {
			p->thread.io_bitmap_max = 0;
			return -ENOMEM;
		}
		memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
				IO_BITMAP_BYTES);
		set_tsk_thread_flag(p, TIF_IO_BITMAP);
	}

	/*
	 * Set a new TLS for the child thread?
	 */
	if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
		if (test_thread_flag(TIF_IA32))
			err = ia32_child_tls(p, childregs);
		else
#endif
			err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
		if (err)
			goto out;
	}
	err = 0;
out:
	if (err && p->thread.io_bitmap_ptr) {
		kfree(p->thread.io_bitmap_ptr);
		p->thread.io_bitmap_max = 0;
	}
	return err;
}

/*
 * This special macro can be used to load a debugging register
 */
#define loaddebug(thread,r) set_debugreg(thread->debugreg ## r, r)

static inline void __switch_to_xtra(struct task_struct *prev_p,
				    struct task_struct *next_p,
				    struct tss_struct *tss)
{
	struct thread_struct *prev, *next;

	prev = &prev_p->thread;
	next = &next_p->thread;

	if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
		loaddebug(next, 0);
		loaddebug(next, 1);
		loaddebug(next, 2);
		loaddebug(next, 3);
		/* no 4 and 5 */
		loaddebug(next, 6);
		loaddebug(next, 7);
	}

	if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
		/*
		 * Copy the relevant range of the IO bitmap.
		 * Normally this is 128 bytes or less:
		 */
		memcpy(tss->io_bitmap, next->io_bitmap_ptr,
		       max(prev->io_bitmap_max, next->io_bitmap_max));
	} else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
		/*
		 * Clear any possible leftover bits:
		 */
		memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
	}
}

/*
 * switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes not supported here. Set the probe on schedule instead.
 */
__kprobes struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
	struct thread_struct *prev = &prev_p->thread,
				 *next = &next_p->thread;
	int cpu = smp_processor_id();
	struct tss_struct *tss = &per_cpu(init_tss, cpu);

	/* we're going to use this soon, after a few expensive things */
	if (next_p->fpu_counter>5)
		prefetch(&next->i387.fxsave);

	/*
	 * Reload esp0, LDT and the page table pointer:
	 */
	tss->rsp0 = next->rsp0;

	/*
	 * Switch DS and ES.
	 * This won't pick up thread selector changes, but I guess that is ok.
	 */
	asm volatile("mov %%es,%0" : "=m" (prev->es));
	if (unlikely(next->es | prev->es))
		loadsegment(es, next->es);

	asm volatile ("mov %%ds,%0" : "=m" (prev->ds));
	if (unlikely(next->ds | prev->ds))
		loadsegment(ds, next->ds);

	load_TLS(next, cpu);

	/*
	 * Switch FS and GS.
	 */
	{
		unsigned fsindex;
		asm volatile("movl %%fs,%0" : "=r" (fsindex));
		/* segment register != 0 always requires a reload.
		   also reload when it has changed.
		   when prev process used 64bit base always reload
		   to avoid an information leak. */
		if (unlikely(fsindex | next->fsindex | prev->fs)) {
			loadsegment(fs, next->fsindex);
			/* check if the user used a selector != 0
			 * if yes clear 64bit base, since overloaded base
			 * is always mapped to the Null selector
			 */
			if (fsindex)
				prev->fs = 0;
		}
		/* when next process has a 64bit base use it */
		if (next->fs)
			wrmsrl(MSR_FS_BASE, next->fs);
		prev->fsindex = fsindex;
	}
	{
		unsigned gsindex;
		asm volatile("movl %%gs,%0" : "=r" (gsindex));
		if (unlikely(gsindex | next->gsindex | prev->gs)) {
			load_gs_index(next->gsindex);
			if (gsindex)
				prev->gs = 0;
		}
		if (next->gs)
			wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
		prev->gsindex = gsindex;
	}

	/* Must be after DS reload */
	unlazy_fpu(prev_p);

	/*
	 * Switch the PDA and FPU contexts.
	 */
	prev->userrsp = read_pda(oldrsp);
	write_pda(oldrsp, next->userrsp);
	write_pda(pcurrent, next_p);

	write_pda(kernelstack,
	(unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
#ifdef CONFIG_CC_STACKPROTECTOR
	write_pda(stack_canary, next_p->stack_canary);
	/*
	 * Build time only check to make sure the stack_canary is at
	 * offset 40 in the pda; this is a gcc ABI requirement
	 */
	BUILD_BUG_ON(offsetof(struct x8664_pda, stack_canary) != 40);
#endif

	/*
	 * Now maybe reload the debug registers and handle I/O bitmaps
	 */
	if (unlikely((task_thread_info(next_p)->flags & _TIF_WORK_CTXSW))
	    || test_tsk_thread_flag(prev_p, TIF_IO_BITMAP))
		__switch_to_xtra(prev_p, next_p, tss);

	/* If the task has used fpu the last 5 timeslices, just do a full
	 * restore of the math state immediately to avoid the trap; the
	 * chances of needing FPU soon are obviously high now
	 */
	if (next_p->fpu_counter>5)
		math_state_restore();
	return prev_p;
}

/*
 * sys_execve() executes a new program.
 */
asmlinkage
long sys_execve(char __user *name, char __user * __user *argv,
		char __user * __user *envp, struct pt_regs regs)
{
	long error;
	char * filename;

	filename = getname(name);
	error = PTR_ERR(filename);
	if (IS_ERR(filename))
		return error;
	error = do_execve(filename, argv, envp, &regs);
	if (error == 0) {
		task_lock(current);
		current->ptrace &= ~PT_DTRACE;
		task_unlock(current);
	}
	putname(filename);
	return error;
}

void set_personality_64bit(void)
{
	/* inherit personality from parent */

	/* Make sure to be in 64bit mode */
	clear_thread_flag(TIF_IA32);

	/* TBD: overwrites user setup. Should have two bits.
	   But 64bit processes have always behaved this way,
	   so it's not too bad. The main problem is just that
	   32bit children are affected again. */
	current->personality &= ~READ_IMPLIES_EXEC;
}

asmlinkage long sys_fork(struct pt_regs *regs)
{
	return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL);
}

asmlinkage long
sys_clone(unsigned long clone_flags, unsigned long newsp,
	  void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
{
	if (!newsp)
		newsp = regs->rsp;
	return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
}

/*
 * This is trivial, and on the face of it looks like it
 * could equally well be done in user mode.
 *
 * Not so, for quite unobvious reasons - register pressure.
 * In user mode vfork() cannot have a stack frame, and if
 * done by calling the "clone()" system call directly, you
 * do not have enough call-clobbered registers to hold all
 * the information you need.
 */
asmlinkage long sys_vfork(struct pt_regs *regs)
{
	return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->rsp, regs, 0,
		    NULL, NULL);
}

unsigned long get_wchan(struct task_struct *p)
{
	unsigned long stack;
	u64 fp, rip;
	int count = 0;

	if (!p || p == current || p->state == TASK_RUNNING)
		return 0;
	stack = (unsigned long)task_stack_page(p);
	if (p->thread.rsp < stack || p->thread.rsp > stack+THREAD_SIZE)
		return 0;
	fp = *(u64 *)(p->thread.rsp);
	do {
		if (fp < (unsigned long)stack ||
		    fp > (unsigned long)stack+THREAD_SIZE)
			return 0;
		rip = *(u64 *)(fp+8);
		if (!in_sched_functions(rip))
			return rip;
		fp = *(u64 *)fp;
	} while (count++ < 16);
	return 0;
}

long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
	int ret = 0;
	int doit = task == current;
	int cpu;

	switch (code) {
	case ARCH_SET_GS:
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, GS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				load_gs_index(GS_TLS_SEL);
			}
			task->thread.gsindex = GS_TLS_SEL;
			task->thread.gs = 0;
		} else {
			task->thread.gsindex = 0;
			task->thread.gs = addr;
			if (doit) {
				load_gs_index(0);
				ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_SET_FS:
		/* Not strictly needed for fs, but do it for symmetry
		   with gs */
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, FS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL));
			}
			task->thread.fsindex = FS_TLS_SEL;
			task->thread.fs = 0;
		} else {
			task->thread.fsindex = 0;
			task->thread.fs = addr;
			if (doit) {
				/* set the selector to 0 to not confuse
				   __switch_to */
				asm volatile("movl %0,%%fs" :: "r" (0));
				ret = checking_wrmsrl(MSR_FS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_GET_FS: {
		unsigned long base;
		if (task->thread.fsindex == FS_TLS_SEL)
			base = read_32bit_tls(task, FS_TLS);
		else if (doit)
			rdmsrl(MSR_FS_BASE, base);
		else
			base = task->thread.fs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}
	case ARCH_GET_GS: {
		unsigned long base;
		unsigned gsindex;
		if (task->thread.gsindex == GS_TLS_SEL)
			base = read_32bit_tls(task, GS_TLS);
		else if (doit) {
			asm("movl %%gs,%0" : "=r" (gsindex));
			if (gsindex)
				rdmsrl(MSR_KERNEL_GS_BASE, base);
			else
				base = task->thread.gs;
		}
		else
			base = task->thread.gs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}

	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}

long sys_arch_prctl(int code, unsigned long addr)
{
	return do_arch_prctl(current, code, addr);
}

/*
 * Capture the user space registers if the task is not running (in user space)
 */
int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
{
	struct pt_regs *pp, ptregs;

	pp = task_pt_regs(tsk);

	ptregs = *pp;
	ptregs.cs &= 0xffff;
	ptregs.ss &= 0xffff;

	elf_core_copy_regs(regs, &ptregs);

	return 1;
}

unsigned long arch_align_stack(unsigned long sp)
{
	if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
		sp -= get_random_int() % 8192;
	return sp & ~0xf;
}