/*
 * Copyright (C) 1995 Linus Torvalds
 *
 * Pentium III FXSR, SSE support
 * Gareth Hughes <gareth@valinux.com>, May 2000
 *
 * X86-64 port
 * Andi Kleen.
 *
 * CPU hotplug support - ashok.raj@intel.com
 */

/*
 * This file handles the architecture-dependent parts of process handling.
 */

#include <stdarg.h>

#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/interrupt.h>
#include <linux/utsname.h>
#include <linux/delay.h>
#include <linux/module.h>
#include <linux/ptrace.h>
#include <linux/random.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/tick.h>

#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/system.h>
#include <asm/io.h>
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/mmu_context.h>
#include <asm/pda.h>
#include <asm/prctl.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/ia32.h>
#include <asm/idle.h>

asmlinkage extern void ret_from_fork(void);

unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;

unsigned long boot_option_idle_override = 0;
EXPORT_SYMBOL(boot_option_idle_override);

/*
 * Power management idle function, if any.
 */
void (*pm_idle)(void);
EXPORT_SYMBOL(pm_idle);
static DEFINE_PER_CPU(unsigned int, cpu_idle_state);

static ATOMIC_NOTIFIER_HEAD(idle_notifier);

void idle_notifier_register(struct notifier_block *n)
{
	atomic_notifier_chain_register(&idle_notifier, n);
}

void enter_idle(void)
{
	write_pda(isidle, 1);
	atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}

static void __exit_idle(void)
{
	if (test_and_clear_bit_pda(0, isidle) == 0)
		return;
	atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
}

/* Called from interrupts to signify idle end */
void exit_idle(void)
{
	/* idle loop has pid 0 */
	if (current->pid)
		return;
	__exit_idle();
}

/*
 * We use this if we don't have any better
 * idle routine.
 */
void default_idle(void)
{
	current_thread_info()->status &= ~TS_POLLING;
	/*
	 * TS_POLLING-cleared state must be visible before we
	 * test NEED_RESCHED:
	 */
	smp_mb();
	local_irq_disable();
	if (!need_resched()) {
		ktime_t t0, t1;
		u64 t0n, t1n;

		t0 = ktime_get();
		t0n = ktime_to_ns(t0);
		safe_halt();	/* enables interrupts racelessly */
		local_irq_disable();
		t1 = ktime_get();
		t1n = ktime_to_ns(t1);
		sched_clock_idle_wakeup_event(t1n - t0n);
	}
	local_irq_enable();
	current_thread_info()->status |= TS_POLLING;
}
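/*
 * Note on default_idle(): safe_halt() is an "sti; hlt" pair, so interrupts
 * are re-enabled only on the instruction that halts and a wakeup cannot be
 * lost between the need_resched() check and the halt.  The ktime delta
 * measured around the halt is fed to sched_clock_idle_wakeup_event() so
 * the scheduler clock can account for the time the CPU spent halted.
 */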
/*
 * On SMP it's slightly faster (but much more power-consuming!)
 * to poll the ->need_resched flag instead of waiting for the
 * cross-CPU IPI to arrive. Use this option with caution.
 */
static void poll_idle(void)
{
	local_irq_enable();
	cpu_relax();
}

#ifdef CONFIG_HOTPLUG_CPU
DECLARE_PER_CPU(int, cpu_state);

#include <asm/nmi.h>
/* We halt the CPU with physical CPU hotplug */
static inline void play_dead(void)
{
	idle_task_exit();
	wbinvd();
	mb();
	/* Ack it */
	__get_cpu_var(cpu_state) = CPU_DEAD;

	local_irq_disable();
	while (1)
		halt();
}
#else
static inline void play_dead(void)
{
	BUG();
}
#endif /* CONFIG_HOTPLUG_CPU */

/*
 * The idle thread. There's no useful work to be
 * done, so just try to conserve power and have a
 * low exit latency (ie sit in a loop waiting for
 * somebody to say that they'd like to reschedule)
 */
void cpu_idle(void)
{
	current_thread_info()->status |= TS_POLLING;
	/* endless idle loop with no priority at all */
	while (1) {
		tick_nohz_stop_sched_tick();
		while (!need_resched()) {
			void (*idle)(void);

			if (__get_cpu_var(cpu_idle_state))
				__get_cpu_var(cpu_idle_state) = 0;

			rmb();
			idle = pm_idle;
			if (!idle)
				idle = default_idle;
			if (cpu_is_offline(smp_processor_id()))
				play_dead();
			/*
			 * Idle routines should keep interrupts disabled
			 * from here on, until they go idle.
			 * Otherwise, idle callbacks can misfire.
			 */
			local_irq_disable();
			enter_idle();
			idle();
			/* In many cases the interrupt that ended idle
			   has already called exit_idle. But some idle
			   loops can be woken up without interrupt. */
			__exit_idle();
		}

		tick_nohz_restart_sched_tick();
		preempt_enable_no_resched();
		schedule();
		preempt_disable();
	}
}

static void do_nothing(void *unused)
{
}

void cpu_idle_wait(void)
{
	unsigned int cpu, this_cpu = get_cpu();
	cpumask_t map, tmp = current->cpus_allowed;

	set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
	put_cpu();

	cpus_clear(map);
	for_each_online_cpu(cpu) {
		per_cpu(cpu_idle_state, cpu) = 1;
		cpu_set(cpu, map);
	}

	__get_cpu_var(cpu_idle_state) = 0;

	wmb();
	do {
		ssleep(1);
		for_each_online_cpu(cpu) {
			if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu))
				cpu_clear(cpu, map);
		}
		cpus_and(map, map, cpu_online_map);
		/*
		 * We waited 1 sec. If a CPU still did not call idle,
		 * it may be because it is in idle and not waking up
		 * because it has nothing to do.
		 * Give all the remaining CPUs a kick.
		 */
		smp_call_function_mask(map, do_nothing, 0, 0);
	} while (!cpus_empty(map));

	set_cpus_allowed(current, tmp);
}
EXPORT_SYMBOL_GPL(cpu_idle_wait);
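/*
 * cpu_idle_wait() works by flagging every online CPU in cpu_idle_state and
 * then waiting until each of them has gone through the idle loop (which
 * clears the flag) at least once.  CPUs that are already sleeping with
 * nothing to do are woken with a dummy cross-call (do_nothing) so they too
 * pass through the loop.  Callers use this to guarantee that no CPU is
 * still running an old pm_idle routine after it has been replaced.
 */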
/*
 * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
 * which can obviate IPI to trigger checking of need_resched.
 * We execute MONITOR against need_resched and enter optimized wait state
 * through MWAIT. Whenever someone changes need_resched, we would be woken
 * up from MWAIT (without an IPI).
 *
 * New with Core Duo processors, MWAIT can take some hints based on CPU
 * capability.
 */
void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
{
	if (!need_resched()) {
		__monitor((void *)&current_thread_info()->flags, 0, 0);
		smp_mb();
		if (!need_resched())
			__mwait(ax, cx);
	}
}

/* Default MONITOR/MWAIT with no hints, used for default C1 state */
static void mwait_idle(void)
{
	if (!need_resched()) {
		__monitor((void *)&current_thread_info()->flags, 0, 0);
		smp_mb();
		if (!need_resched())
			__sti_mwait(0, 0);
		else
			local_irq_enable();
	} else {
		local_irq_enable();
	}
}

static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c)
{
	if (force_mwait)
		return 1;
	/* Any C1 states supported? */
	return c->cpuid_level >= 5 && ((cpuid_edx(5) >> 4) & 0xf) > 0;
}

void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
{
	static int selected;

	if (selected)
		return;
#ifdef CONFIG_X86_SMP
	if (pm_idle == poll_idle && smp_num_siblings > 1) {
		printk(KERN_WARNING "WARNING: polling idle and HT enabled,"
			" performance may degrade.\n");
	}
#endif
	if (cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c)) {
		/*
		 * Skip, if setup has overridden idle.
		 * One CPU supports mwait => All CPUs support mwait
		 */
		if (!pm_idle) {
			printk(KERN_INFO "using mwait in idle threads.\n");
			pm_idle = mwait_idle;
		}
	}
	selected = 1;
}

static int __init idle_setup(char *str)
{
	if (!strcmp(str, "poll")) {
		printk("using polling idle threads.\n");
		pm_idle = poll_idle;
	} else if (!strcmp(str, "mwait"))
		force_mwait = 1;
	else
		return -1;

	boot_option_idle_override = 1;
	return 0;
}
early_param("idle", idle_setup);
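/*
 * Boot-time examples: "idle=poll" installs poll_idle() as pm_idle, and
 * "idle=mwait" sets force_mwait so mwait_usable() accepts MWAIT even when
 * the CPUID C1 sub-state check would reject it.  Both also set
 * boot_option_idle_override, which is exported so other idle-selection
 * code can honor the explicit boot choice.
 */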
/* Prints also some state that isn't saved in the pt_regs */
void __show_regs(struct pt_regs *regs)
{
	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
	unsigned long d0, d1, d2, d3, d6, d7;
	unsigned int fsindex, gsindex;
	unsigned int ds, cs, es;

	printk("\n");
	print_modules();
	printk("Pid: %d, comm: %.20s %s %s %.*s\n",
		current->pid, current->comm, print_tainted(),
		init_utsname()->release,
		(int)strcspn(init_utsname()->version, " "),
		init_utsname()->version);
	printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
	printk_address(regs->ip, 1);
	printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->sp,
		regs->flags);
	printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
	       regs->ax, regs->bx, regs->cx);
	printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
	       regs->dx, regs->si, regs->di);
	printk("RBP: %016lx R08: %016lx R09: %016lx\n",
	       regs->bp, regs->r8, regs->r9);
	printk("R10: %016lx R11: %016lx R12: %016lx\n",
	       regs->r10, regs->r11, regs->r12);
	printk("R13: %016lx R14: %016lx R15: %016lx\n",
	       regs->r13, regs->r14, regs->r15);

	asm("movl %%ds,%0" : "=r" (ds));
	asm("movl %%cs,%0" : "=r" (cs));
	asm("movl %%es,%0" : "=r" (es));
	asm("movl %%fs,%0" : "=r" (fsindex));
	asm("movl %%gs,%0" : "=r" (gsindex));

	rdmsrl(MSR_FS_BASE, fs);
	rdmsrl(MSR_GS_BASE, gs);
	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

	cr0 = read_cr0();
	cr2 = read_cr2();
	cr3 = read_cr3();
	cr4 = read_cr4();

	printk("FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
	       fs, fsindex, gs, gsindex, shadowgs);
	printk("CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0);
	printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4);

	get_debugreg(d0, 0);
	get_debugreg(d1, 1);
	get_debugreg(d2, 2);
	printk("DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
	get_debugreg(d3, 3);
	get_debugreg(d6, 6);
	get_debugreg(d7, 7);
	printk("DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
}

void show_regs(struct pt_regs *regs)
{
	printk("CPU %d:", smp_processor_id());
	__show_regs(regs);
	show_trace(NULL, regs, (void *)(regs + 1), regs->bp);
}

/*
 * Free current thread data structures etc.
 */
void exit_thread(void)
{
	struct task_struct *me = current;
	struct thread_struct *t = &me->thread;

	if (me->thread.io_bitmap_ptr) {
		struct tss_struct *tss = &per_cpu(init_tss, get_cpu());

		kfree(t->io_bitmap_ptr);
		t->io_bitmap_ptr = NULL;
		clear_thread_flag(TIF_IO_BITMAP);
		/*
		 * Careful, clear this in the TSS too:
		 */
		memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
		t->io_bitmap_max = 0;
		put_cpu();
	}
}

void flush_thread(void)
{
	struct task_struct *tsk = current;

	if (test_tsk_thread_flag(tsk, TIF_ABI_PENDING)) {
		clear_tsk_thread_flag(tsk, TIF_ABI_PENDING);
		if (test_tsk_thread_flag(tsk, TIF_IA32)) {
			clear_tsk_thread_flag(tsk, TIF_IA32);
		} else {
			set_tsk_thread_flag(tsk, TIF_IA32);
			current_thread_info()->status |= TS_COMPAT;
		}
	}
	clear_tsk_thread_flag(tsk, TIF_DEBUG);

	tsk->thread.debugreg0 = 0;
	tsk->thread.debugreg1 = 0;
	tsk->thread.debugreg2 = 0;
	tsk->thread.debugreg3 = 0;
	tsk->thread.debugreg6 = 0;
	tsk->thread.debugreg7 = 0;
	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
	/*
	 * Forget coprocessor state.
	 */
	clear_fpu(tsk);
	clear_used_math();
}

void release_thread(struct task_struct *dead_task)
{
	if (dead_task->mm) {
		if (dead_task->mm->context.size) {
			printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
				dead_task->comm,
				dead_task->mm->context.ldt,
				dead_task->mm->context.size);
			BUG();
		}
	}
}

static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
{
	struct user_desc ud = {
		.base_addr = addr,
		.limit = 0xfffff,
		.seg_32bit = 1,
		.limit_in_pages = 1,
		.useable = 1,
	};
	struct desc_struct *desc = t->thread.tls_array;
	desc += tls;
	fill_ldt(desc, &ud);
}

static inline u32 read_32bit_tls(struct task_struct *t, int tls)
{
	return get_desc_base(&t->thread.tls_array[tls]);
}
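/*
 * set_32bit_tls() and read_32bit_tls() manage the per-thread GDT (TLS)
 * descriptors used when a segment base fits in 32 bits: the base is stored
 * in tls_array[] and becomes visible after load_TLS(), and read_32bit_tls()
 * simply extracts the base back out of the descriptor.  do_arch_prctl()
 * below prefers the FS_TLS/GS_TLS slots for small bases because reloading
 * a selector on context switch is cheaper than a WRMSR of the base MSRs.
 */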
/*
 * This gets called before we allocate a new thread and copy
 * the current task into it.
 */
void prepare_to_copy(struct task_struct *tsk)
{
	unlazy_fpu(tsk);
}

int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
		unsigned long unused,
		struct task_struct *p, struct pt_regs *regs)
{
	int err;
	struct pt_regs *childregs;
	struct task_struct *me = current;

	childregs = ((struct pt_regs *)
			(THREAD_SIZE + task_stack_page(p))) - 1;
	*childregs = *regs;

	childregs->ax = 0;
	childregs->sp = sp;
	if (sp == ~0UL)
		childregs->sp = (unsigned long)childregs;

	p->thread.sp = (unsigned long) childregs;
	p->thread.sp0 = (unsigned long) (childregs+1);
	p->thread.usersp = me->thread.usersp;

	set_tsk_thread_flag(p, TIF_FORK);

	p->thread.fs = me->thread.fs;
	p->thread.gs = me->thread.gs;

	asm("mov %%gs,%0" : "=m" (p->thread.gsindex));
	asm("mov %%fs,%0" : "=m" (p->thread.fsindex));
	asm("mov %%es,%0" : "=m" (p->thread.es));
	asm("mov %%ds,%0" : "=m" (p->thread.ds));

	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
		p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
		if (!p->thread.io_bitmap_ptr) {
			p->thread.io_bitmap_max = 0;
			return -ENOMEM;
		}
		memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
			IO_BITMAP_BYTES);
		set_tsk_thread_flag(p, TIF_IO_BITMAP);
	}

	/*
	 * Set a new TLS for the child thread?
	 */
	if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
		if (test_thread_flag(TIF_IA32))
			err = do_set_thread_area(p, -1,
				(struct user_desc __user *)childregs->si, 0);
		else
#endif
			err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
		if (err)
			goto out;
	}
	err = 0;
out:
	if (err && p->thread.io_bitmap_ptr) {
		kfree(p->thread.io_bitmap_ptr);
		p->thread.io_bitmap_max = 0;
	}
	return err;
}

/*
 * This special macro can be used to load a debugging register
 */
#define loaddebug(thread, r) set_debugreg(thread->debugreg ## r, r)
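/*
 * __switch_to_xtra() is the slow path of the context switch.  It is only
 * called from __switch_to() below when the previous or the next task has
 * one of the _TIF_WORK_CTXSW flags set, and it handles the pieces that are
 * too expensive to do unconditionally: the DS area and DEBUGCTL MSRs, the
 * hardware debug registers and the per-CPU TSS I/O permission bitmap.
 */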
static inline void __switch_to_xtra(struct task_struct *prev_p,
				    struct task_struct *next_p,
				    struct tss_struct *tss)
{
	struct thread_struct *prev, *next;
	unsigned long debugctl;

	prev = &prev_p->thread;
	next = &next_p->thread;

	debugctl = prev->debugctlmsr;
	if (next->ds_area_msr != prev->ds_area_msr) {
		/* We clear debugctl to make sure DS
		 * is not in use when we change it: */
		debugctl = 0;
		wrmsrl(MSR_IA32_DEBUGCTLMSR, 0);
		wrmsrl(MSR_IA32_DS_AREA, next->ds_area_msr);
	}

	if (next->debugctlmsr != debugctl)
		wrmsrl(MSR_IA32_DEBUGCTLMSR, next->debugctlmsr);

	if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
		loaddebug(next, 0);
		loaddebug(next, 1);
		loaddebug(next, 2);
		loaddebug(next, 3);
		/* no 4 and 5 */
		loaddebug(next, 6);
		loaddebug(next, 7);
	}

	if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
		/*
		 * Copy the relevant range of the IO bitmap.
		 * Normally this is 128 bytes or less:
		 */
		memcpy(tss->io_bitmap, next->io_bitmap_ptr,
		       max(prev->io_bitmap_max, next->io_bitmap_max));
	} else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
		/*
		 * Clear any possible leftover bits:
		 */
		memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
	}

#ifdef X86_BTS
	if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
		ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);

	if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
		ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
#endif
}
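/*
 * Note on the I/O bitmap handling above: the copy length is
 * max(prev->io_bitmap_max, next->io_bitmap_max) so that any ports the
 * previous task had enabled beyond the next task's range are overwritten
 * (the unused tail of a task's bitmap is expected to stay in the all-ones,
 * i.e. access-denied, state), and the memset() branch restores the default
 * deny-all bitmap when the next task has no bitmap at all.
 */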
/*
 * switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes not supported here. Set the probe on schedule instead.
 */
struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
	struct thread_struct *prev = &prev_p->thread,
				 *next = &next_p->thread;
	int cpu = smp_processor_id();
	struct tss_struct *tss = &per_cpu(init_tss, cpu);

	/* we're going to use this soon, after a few expensive things */
	if (next_p->fpu_counter > 5)
		prefetch(&next->i387.fxsave);

	/*
	 * Reload esp0, LDT and the page table pointer:
	 */
	load_sp0(tss, next);

	/*
	 * Switch DS and ES.
	 * This won't pick up thread selector changes, but I guess that is ok.
	 */
	asm volatile("mov %%es,%0" : "=m" (prev->es));
	if (unlikely(next->es | prev->es))
		loadsegment(es, next->es);

	asm volatile("mov %%ds,%0" : "=m" (prev->ds));
	if (unlikely(next->ds | prev->ds))
		loadsegment(ds, next->ds);

	load_TLS(next, cpu);

	/*
	 * Switch FS and GS.
	 */
	{
		unsigned fsindex;
		asm volatile("movl %%fs,%0" : "=r" (fsindex));
		/* A segment selector != 0 always requires a reload.
		   Also reload when it has changed.
		   When the previous process used a 64bit base, always
		   reload to avoid an information leak. */
		if (unlikely(fsindex | next->fsindex | prev->fs)) {
			loadsegment(fs, next->fsindex);
			/* Check if the user used a selector != 0; if yes
			 * clear the 64bit base, since the overloaded base
			 * is always mapped to the NULL selector.
			 */
			if (fsindex)
				prev->fs = 0;
		}
		/* when next process has a 64bit base use it */
		if (next->fs)
			wrmsrl(MSR_FS_BASE, next->fs);
		prev->fsindex = fsindex;
	}
	{
		unsigned gsindex;
		asm volatile("movl %%gs,%0" : "=r" (gsindex));
		if (unlikely(gsindex | next->gsindex | prev->gs)) {
			load_gs_index(next->gsindex);
			if (gsindex)
				prev->gs = 0;
		}
		if (next->gs)
			wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
		prev->gsindex = gsindex;
	}

	/* Must be after DS reload */
	unlazy_fpu(prev_p);

	/*
	 * Switch the PDA and FPU contexts.
	 */
	prev->usersp = read_pda(oldrsp);
	write_pda(oldrsp, next->usersp);
	write_pda(pcurrent, next_p);

	write_pda(kernelstack,
		  (unsigned long)task_stack_page(next_p) +
		  THREAD_SIZE - PDA_STACKOFFSET);
#ifdef CONFIG_CC_STACKPROTECTOR
	write_pda(stack_canary, next_p->stack_canary);
	/*
	 * Build time only check to make sure the stack_canary is at
	 * offset 40 in the pda; this is a gcc ABI requirement
	 */
	BUILD_BUG_ON(offsetof(struct x8664_pda, stack_canary) != 40);
#endif

	/*
	 * Now maybe reload the debug registers and handle I/O bitmaps
	 */
	if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
		     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
		__switch_to_xtra(prev_p, next_p, tss);

	/* If the task has used the FPU in the last 5 timeslices, just do a
	 * full restore of the math state immediately to avoid the trap;
	 * the chances of needing the FPU soon are obviously high now.
	 */
	if (next_p->fpu_counter > 5)
		math_state_restore();
	return prev_p;
}

/*
 * sys_execve() executes a new program.
 */
asmlinkage
long sys_execve(char __user *name, char __user * __user *argv,
		char __user * __user *envp, struct pt_regs *regs)
{
	long error;
	char *filename;

	filename = getname(name);
	error = PTR_ERR(filename);
	if (IS_ERR(filename))
		return error;
	error = do_execve(filename, argv, envp, regs);
	putname(filename);
	return error;
}

void set_personality_64bit(void)
{
	/* inherit personality from parent */

	/* Make sure to be in 64bit mode */
	clear_thread_flag(TIF_IA32);

	/* TBD: overwrites user setup. Should have two bits.
	   But 64bit processes have always behaved this way,
	   so it's not too bad. The main problem is just that
	   32bit children are affected again. */
	current->personality &= ~READ_IMPLIES_EXEC;
}

asmlinkage long sys_fork(struct pt_regs *regs)
{
	return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL);
}

asmlinkage long
sys_clone(unsigned long clone_flags, unsigned long newsp,
	  void __user *parent_tid, void __user *child_tid,
	  struct pt_regs *regs)
{
	if (!newsp)
		newsp = regs->sp;
	return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
}
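/*
 * sys_fork() and sys_clone() above hand the parent's user stack pointer
 * (regs->sp) to do_fork() when the caller did not supply a new one.  The
 * struct pt_regs argument comes from the syscall entry stubs, which save
 * the full register frame before calling into these handlers.
 */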
/*
 * This is trivial, and on the face of it looks like it
 * could equally well be done in user mode.
 *
 * Not so, for quite unobvious reasons - register pressure.
 * In user mode vfork() cannot have a stack frame, and if
 * done by calling the "clone()" system call directly, you
 * do not have enough call-clobbered registers to hold all
 * the information you need.
 */
asmlinkage long sys_vfork(struct pt_regs *regs)
{
	return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0,
		       NULL, NULL);
}

unsigned long get_wchan(struct task_struct *p)
{
	unsigned long stack;
	u64 fp, ip;
	int count = 0;

	if (!p || p == current || p->state == TASK_RUNNING)
		return 0;
	stack = (unsigned long)task_stack_page(p);
	if (p->thread.sp < stack || p->thread.sp > stack + THREAD_SIZE)
		return 0;
	fp = *(u64 *)(p->thread.sp);
	do {
		if (fp < (unsigned long)stack ||
		    fp > (unsigned long)stack + THREAD_SIZE)
			return 0;
		ip = *(u64 *)(fp + 8);
		if (!in_sched_functions(ip))
			return ip;
		fp = *(u64 *)fp;
	} while (count++ < 16);
	return 0;
}

long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
	int ret = 0;
	int doit = task == current;
	int cpu;

	switch (code) {
	case ARCH_SET_GS:
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, GS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				load_gs_index(GS_TLS_SEL);
			}
			task->thread.gsindex = GS_TLS_SEL;
			task->thread.gs = 0;
		} else {
			task->thread.gsindex = 0;
			task->thread.gs = addr;
			if (doit) {
				load_gs_index(0);
				ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_SET_FS:
		/* Not strictly needed for fs, but do it for symmetry
		   with gs */
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, FS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL));
			}
			task->thread.fsindex = FS_TLS_SEL;
			task->thread.fs = 0;
		} else {
			task->thread.fsindex = 0;
			task->thread.fs = addr;
			if (doit) {
				/* set the selector to 0 to not confuse
				   __switch_to */
				asm volatile("movl %0,%%fs" :: "r" (0));
				ret = checking_wrmsrl(MSR_FS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_GET_FS: {
		unsigned long base;
		if (task->thread.fsindex == FS_TLS_SEL)
			base = read_32bit_tls(task, FS_TLS);
		else if (doit)
			rdmsrl(MSR_FS_BASE, base);
		else
			base = task->thread.fs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}
	case ARCH_GET_GS: {
		unsigned long base;
		unsigned gsindex;
		if (task->thread.gsindex == GS_TLS_SEL)
			base = read_32bit_tls(task, GS_TLS);
		else if (doit) {
			asm("movl %%gs,%0" : "=r" (gsindex));
			if (gsindex)
				rdmsrl(MSR_KERNEL_GS_BASE, base);
			else
				base = task->thread.gs;
		} else
			base = task->thread.gs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}

	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}

long sys_arch_prctl(int code, unsigned long addr)
{
	return do_arch_prctl(current, code, addr);
}

unsigned long arch_align_stack(unsigned long sp)
{
	if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
		sp -= get_random_int() % 8192;
	return sp & ~0xf;
}
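/*
 * arch_align_stack() jitters the initial user stack pointer by up to 8 kB
 * and then rounds it down to a 16-byte boundary (as the ABI expects),
 * unless the task or the system has disabled address-space randomization.
 * arch_randomize_brk() below likewise places the heap start somewhere in a
 * 32 MB window above mm->brk, falling back to mm->brk itself if
 * randomize_range() returns 0.
 */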
unsigned long arch_randomize_brk(struct mm_struct *mm)
{
	unsigned long range_end = mm->brk + 0x02000000;
	return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
}