/*
 * Copyright (C) 1995 Linus Torvalds
 *
 * Pentium III FXSR, SSE support
 *	Gareth Hughes <gareth@valinux.com>, May 2000
 *
 * X86-64 port
 *	Andi Kleen.
 *
 * CPU hotplug support - ashok.raj@intel.com
 */

/*
 * This file handles the architecture-dependent parts of process handling.
 */

#include <linux/stackprotector.h>
#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/interrupt.h>
#include <linux/delay.h>
#include <linux/module.h>
#include <linux/ptrace.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/tick.h>
#include <linux/prctl.h>
#include <linux/uaccess.h>
#include <linux/io.h>
#include <linux/ftrace.h>

#include <asm/pgtable.h>
#include <asm/system.h>
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/mmu_context.h>
#include <asm/prctl.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/ia32.h>
#include <asm/idle.h>
#include <asm/syscalls.h>
#include <asm/ds.h>
#include <asm/debugreg.h>

asmlinkage extern void ret_from_fork(void);

DEFINE_PER_CPU(unsigned long, old_rsp);
static DEFINE_PER_CPU(unsigned char, is_idle);

unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;

static ATOMIC_NOTIFIER_HEAD(idle_notifier);

void idle_notifier_register(struct notifier_block *n)
{
        atomic_notifier_chain_register(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_register);

void idle_notifier_unregister(struct notifier_block *n)
{
        atomic_notifier_chain_unregister(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_unregister);

void enter_idle(void)
{
        percpu_write(is_idle, 1);
        atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}

static void __exit_idle(void)
{
        if (x86_test_and_clear_bit_percpu(0, is_idle) == 0)
                return;
        atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
}

/* Called from interrupts to signify the end of idle */
void exit_idle(void)
{
        /* the idle loop has pid 0 */
        if (current->pid)
                return;
        __exit_idle();
}
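
/*
 * Illustrative sketch (not part of this file): a driver that wants to run
 * code around the per-CPU idle transitions can hook the notifier chain
 * above.  The callback and notifier_block names below are hypothetical;
 * only idle_notifier_register()/idle_notifier_unregister() and the
 * IDLE_START/IDLE_END events (from <asm/idle.h>) come from this code.
 *
 *	static int my_idle_notify(struct notifier_block *nb,
 *				  unsigned long action, void *data)
 *	{
 *		return NOTIFY_OK;	(action is IDLE_START or IDLE_END)
 *	}
 *	static struct notifier_block my_idle_nb = {
 *		.notifier_call = my_idle_notify,
 *	};
 *
 * and then call idle_notifier_register(&my_idle_nb) from its init path and
 * idle_notifier_unregister(&my_idle_nb) on teardown.
 */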

#ifndef CONFIG_SMP
static inline void play_dead(void)
{
        BUG();
}
#endif

/*
 * The idle thread. There's no useful work to be
 * done, so just try to conserve power and have a
 * low exit latency (i.e. sit in a loop waiting for
 * somebody to say that they'd like to reschedule)
 */
void cpu_idle(void)
{
        current_thread_info()->status |= TS_POLLING;

        /*
         * If we're the non-boot CPU, nothing set the stack canary up
         * for us. CPU0 already has it initialized but no harm in
         * doing it again. This is a good place for updating it, as
         * we won't ever return from this function (so the invalid
         * canaries already on the stack won't ever trigger).
         */
        boot_init_stack_canary();

        /* endless idle loop with no priority at all */
        while (1) {
                tick_nohz_stop_sched_tick(1);
                while (!need_resched()) {

                        rmb();

                        if (cpu_is_offline(smp_processor_id()))
                                play_dead();
                        /*
                         * Idle routines should keep interrupts disabled
                         * from here on, until they go to idle.
                         * Otherwise, idle callbacks can misfire.
                         */
                        local_irq_disable();
                        enter_idle();
                        /* Don't trace irqs off for idle */
                        stop_critical_timings();
                        pm_idle();
                        start_critical_timings();
                        /*
                         * In many cases the interrupt that ended idle
                         * has already called exit_idle. But some idle
                         * loops can be woken up without an interrupt.
                         */
                        __exit_idle();
                }

                tick_nohz_restart_sched_tick();
                preempt_enable_no_resched();
                schedule();
                preempt_disable();
        }
}
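
/*
 * Note on the routines reached through pm_idle() above (descriptive sketch
 * only; the real implementations live in process.c and in cpuidle drivers):
 * they are entered with interrupts disabled, as set up by the
 * local_irq_disable() before enter_idle(), and must re-enable interrupts
 * themselves, ideally atomically with the halt so a wakeup cannot be lost.
 * A minimal routine in the style of default_idle() would look roughly like:
 *
 *	static void minimal_idle(void)
 *	{
 *		if (!need_resched())
 *			safe_halt();		("sti; hlt", wakes on the next IRQ)
 *		else
 *			local_irq_enable();
 *	}
 *
 * minimal_idle is a hypothetical name; need_resched(), safe_halt() and
 * local_irq_enable() are the real primitives used by the in-tree idle
 * routines.
 */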

/* Also prints some state that isn't saved in the pt_regs */
void __show_regs(struct pt_regs *regs, int all)
{
        unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
        unsigned long d0, d1, d2, d3, d6, d7;
        unsigned int fsindex, gsindex;
        unsigned int ds, cs, es;

        show_regs_common();
        printk(KERN_INFO "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
        printk_address(regs->ip, 1);
        printk(KERN_INFO "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss,
                        regs->sp, regs->flags);
        printk(KERN_INFO "RAX: %016lx RBX: %016lx RCX: %016lx\n",
               regs->ax, regs->bx, regs->cx);
        printk(KERN_INFO "RDX: %016lx RSI: %016lx RDI: %016lx\n",
               regs->dx, regs->si, regs->di);
        printk(KERN_INFO "RBP: %016lx R08: %016lx R09: %016lx\n",
               regs->bp, regs->r8, regs->r9);
        printk(KERN_INFO "R10: %016lx R11: %016lx R12: %016lx\n",
               regs->r10, regs->r11, regs->r12);
        printk(KERN_INFO "R13: %016lx R14: %016lx R15: %016lx\n",
               regs->r13, regs->r14, regs->r15);

        asm("movl %%ds,%0" : "=r" (ds));
        asm("movl %%cs,%0" : "=r" (cs));
        asm("movl %%es,%0" : "=r" (es));
        asm("movl %%fs,%0" : "=r" (fsindex));
        asm("movl %%gs,%0" : "=r" (gsindex));

        rdmsrl(MSR_FS_BASE, fs);
        rdmsrl(MSR_GS_BASE, gs);
        rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

        if (!all)
                return;

        cr0 = read_cr0();
        cr2 = read_cr2();
        cr3 = read_cr3();
        cr4 = read_cr4();

        printk(KERN_INFO "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
               fs, fsindex, gs, gsindex, shadowgs);
        printk(KERN_INFO "CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
                        es, cr0);
        printk(KERN_INFO "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
                        cr4);

        get_debugreg(d0, 0);
        get_debugreg(d1, 1);
        get_debugreg(d2, 2);
        printk(KERN_INFO "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
        get_debugreg(d3, 3);
        get_debugreg(d6, 6);
        get_debugreg(d7, 7);
        printk(KERN_INFO "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
}

void show_regs(struct pt_regs *regs)
{
        show_registers(regs);
        show_trace(NULL, regs, (void *)(regs + 1), regs->bp);
}

void release_thread(struct task_struct *dead_task)
{
        if (dead_task->mm) {
                if (dead_task->mm->context.size) {
                        printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
                                dead_task->comm,
                                dead_task->mm->context.ldt,
                                dead_task->mm->context.size);
                        BUG();
                }
        }
}

static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
{
        struct user_desc ud = {
                .base_addr = addr,
                .limit = 0xfffff,
                .seg_32bit = 1,
                .limit_in_pages = 1,
                .useable = 1,
        };
        struct desc_struct *desc = t->thread.tls_array;
        desc += tls;
        fill_ldt(desc, &ud);
}

static inline u32 read_32bit_tls(struct task_struct *t, int tls)
{
        return get_desc_base(&t->thread.tls_array[tls]);
}

/*
 * This gets called before we allocate a new thread and copy
 * the current task into it.
 */
void prepare_to_copy(struct task_struct *tsk)
{
        unlazy_fpu(tsk);
}

int copy_thread(unsigned long clone_flags, unsigned long sp,
                unsigned long unused,
                struct task_struct *p, struct pt_regs *regs)
{
        int err;
        struct pt_regs *childregs;
        struct task_struct *me = current;

        childregs = ((struct pt_regs *)
                        (THREAD_SIZE + task_stack_page(p))) - 1;
        *childregs = *regs;

        childregs->ax = 0;
        childregs->sp = sp;
        if (sp == ~0UL)
                childregs->sp = (unsigned long)childregs;	/* kernel thread */

        p->thread.sp = (unsigned long) childregs;
        p->thread.sp0 = (unsigned long) (childregs+1);
        p->thread.usersp = me->thread.usersp;

        set_tsk_thread_flag(p, TIF_FORK);

        p->thread.fs = me->thread.fs;
        p->thread.gs = me->thread.gs;
        p->thread.io_bitmap_ptr = NULL;

        savesegment(gs, p->thread.gsindex);
        savesegment(fs, p->thread.fsindex);
        savesegment(es, p->thread.es);
        savesegment(ds, p->thread.ds);

        err = -ENOMEM;
        memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));

        if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
                p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
                if (!p->thread.io_bitmap_ptr) {
                        p->thread.io_bitmap_max = 0;
                        return -ENOMEM;
                }
                memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
                        IO_BITMAP_BYTES);
                set_tsk_thread_flag(p, TIF_IO_BITMAP);
        }

        /*
         * Set a new TLS for the child thread?
         */
        if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
                if (test_thread_flag(TIF_IA32))
                        err = do_set_thread_area(p, -1,
                                (struct user_desc __user *)childregs->si, 0);
                else
#endif
                        err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
                if (err)
                        goto out;
        }

        clear_tsk_thread_flag(p, TIF_DS_AREA_MSR);
        p->thread.ds_ctx = NULL;

        clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);
        p->thread.debugctlmsr = 0;

        err = 0;
out:
        if (err && p->thread.io_bitmap_ptr) {
                kfree(p->thread.io_bitmap_ptr);
                p->thread.io_bitmap_max = 0;
        }

        return err;
}

static void
start_thread_common(struct pt_regs *regs, unsigned long new_ip,
                    unsigned long new_sp,
                    unsigned int _cs, unsigned int _ss, unsigned int _ds)
{
        loadsegment(fs, 0);
        loadsegment(es, _ds);
        loadsegment(ds, _ds);
        load_gs_index(0);
        regs->ip = new_ip;
        regs->sp = new_sp;
        percpu_write(old_rsp, new_sp);
        regs->cs = _cs;
        regs->ss = _ss;
        regs->flags = X86_EFLAGS_IF;
        set_fs(USER_DS);
        /*
         * Free the old FP and other extended state
         */
        free_thread_xstate(current);
}

void
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
        start_thread_common(regs, new_ip, new_sp,
                            __USER_CS, __USER_DS, 0);
}

#ifdef CONFIG_IA32_EMULATION
void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp)
{
        start_thread_common(regs, new_ip, new_sp,
                            __USER32_CS, __USER32_DS, __USER32_DS);
}
#endif

/*
 * switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes are not supported here. Set the probe on schedule instead.
 * The function graph tracer is not supported either.
 */
__notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
        struct thread_struct *prev = &prev_p->thread;
        struct thread_struct *next = &next_p->thread;
        int cpu = smp_processor_id();
        struct tss_struct *tss = &per_cpu(init_tss, cpu);
        unsigned fsindex, gsindex;
        bool preload_fpu;

        /*
         * If the task has used the FPU in the last 5 timeslices, just do a
         * full restore of the math state immediately to avoid the trap; the
         * chances of needing the FPU soon are obviously high now
         */
        preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5;

        /* we're going to use this soon, after a few expensive things */
        if (preload_fpu)
                prefetch(next->xstate);

        /*
         * Reload esp0, LDT and the page table pointer:
         */
        load_sp0(tss, next);

        /*
         * Switch DS and ES.
         * This won't pick up thread selector changes, but I guess that is ok.
         */
        savesegment(es, prev->es);
        if (unlikely(next->es | prev->es))
                loadsegment(es, next->es);

        savesegment(ds, prev->ds);
        if (unlikely(next->ds | prev->ds))
                loadsegment(ds, next->ds);


        /*
         * We must save %fs and %gs before load_TLS() because
         * %fs and %gs may be cleared by load_TLS().
         *
         * (e.g. xen_load_tls())
         */
        savesegment(fs, fsindex);
        savesegment(gs, gsindex);

        load_TLS(next, cpu);

        /* Must be after DS reload */
        unlazy_fpu(prev_p);

        /* Make sure the CPU is ready for the new context */
        if (preload_fpu)
                clts();

        /*
         * Leave lazy mode, flushing any hypercalls made here.
         * This must be done before restoring TLS segments so
         * the GDT and LDT are properly updated, and must be
         * done before math_state_restore, so the TS bit is up
         * to date.
         */
        arch_end_context_switch(next_p);

        /*
         * Switch FS and GS.
         *
         * A segment register != 0 always requires a reload. Also
         * reload when it has changed. When the previous process used a
         * 64-bit base, always reload to avoid an information leak.
         */
        if (unlikely(fsindex | next->fsindex | prev->fs)) {
                loadsegment(fs, next->fsindex);
                /*
                 * Check if the user used a selector != 0; if yes
                 * clear the 64-bit base, since an overloaded base is
                 * always mapped to the NULL selector
                 */
                if (fsindex)
                        prev->fs = 0;
        }
        /* when the next process has a 64-bit base, use it */
        if (next->fs)
                wrmsrl(MSR_FS_BASE, next->fs);
        prev->fsindex = fsindex;

        if (unlikely(gsindex | next->gsindex | prev->gs)) {
                load_gs_index(next->gsindex);
                if (gsindex)
                        prev->gs = 0;
        }
        if (next->gs)
                wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
        prev->gsindex = gsindex;

        /*
         * Switch the PDA and FPU contexts.
         */
        prev->usersp = percpu_read(old_rsp);
        percpu_write(old_rsp, next->usersp);
        percpu_write(current_task, next_p);

        percpu_write(kernel_stack,
                     (unsigned long)task_stack_page(next_p) +
                     THREAD_SIZE - KERNEL_STACK_OFFSET);

        /*
         * Now maybe reload the debug registers and handle I/O bitmaps
         */
        if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
                     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
                __switch_to_xtra(prev_p, next_p, tss);

        /*
         * Preload the FPU context, now that we've determined that the
         * task is likely to be using it.
         */
        if (preload_fpu)
                __math_state_restore();

        return prev_p;
}
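
/*
 * Summary of the fs/gs switching above (descriptive only): each thread's
 * user fs/gs state is either a non-zero selector (thread.fsindex /
 * thread.gsindex, with the base taken from the GDT/LDT entry and picked up
 * by the loadsegment()/load_gs_index() calls) or a zero selector plus a raw
 * 64-bit base (thread.fs / thread.gs, written with MSR_FS_BASE /
 * MSR_KERNEL_GS_BASE).  The "old index | new index | old base" tests skip
 * the segment reloads when both tasks use the null selector and no 64-bit
 * base, while still guaranteeing that a stale base left over from the
 * previous task can never leak into the next one.
 */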

/*
 * sys_execve() executes a new program.
 */
asmlinkage
long sys_execve(char __user *name, char __user * __user *argv,
                char __user * __user *envp, struct pt_regs *regs)
{
        long error;
        char *filename;

        filename = getname(name);
        error = PTR_ERR(filename);
        if (IS_ERR(filename))
                return error;
        error = do_execve(filename, argv, envp, regs);
        putname(filename);
        return error;
}

void set_personality_64bit(void)
{
        /* inherit personality from parent */

        /* Make sure to be in 64-bit mode */
        clear_thread_flag(TIF_IA32);

        /*
         * TBD: this overwrites the user's setup. Should have two bits.
         * But 64-bit processes have always behaved this way,
         * so it's not too bad. The main problem is just that
         * 32-bit children are affected again.
         */
        current->personality &= ~READ_IMPLIES_EXEC;
}

asmlinkage long
sys_clone(unsigned long clone_flags, unsigned long newsp,
          void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
{
        if (!newsp)
                newsp = regs->sp;
        return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
}

unsigned long get_wchan(struct task_struct *p)
{
        unsigned long stack;
        u64 fp, ip;
        int count = 0;

        if (!p || p == current || p->state == TASK_RUNNING)
                return 0;
        stack = (unsigned long)task_stack_page(p);
        if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE)
                return 0;
        fp = *(u64 *)(p->thread.sp);
        do {
                if (fp < (unsigned long)stack ||
                    fp >= (unsigned long)stack+THREAD_SIZE)
                        return 0;
                ip = *(u64 *)(fp+8);
                if (!in_sched_functions(ip))
                        return ip;
                fp = *(u64 *)fp;
        } while (count++ < 16);
        return 0;
}

long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
        int ret = 0;
        int doit = task == current;
        int cpu;

        switch (code) {
        case ARCH_SET_GS:
                if (addr >= TASK_SIZE_OF(task))
                        return -EPERM;
                cpu = get_cpu();
                /*
                 * handle small bases via the GDT because that's faster to
                 * switch.
                 */
                if (addr <= 0xffffffff) {
                        set_32bit_tls(task, GS_TLS, addr);
                        if (doit) {
                                load_TLS(&task->thread, cpu);
                                load_gs_index(GS_TLS_SEL);
                        }
                        task->thread.gsindex = GS_TLS_SEL;
                        task->thread.gs = 0;
                } else {
                        task->thread.gsindex = 0;
                        task->thread.gs = addr;
                        if (doit) {
                                load_gs_index(0);
                                ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
                        }
                }
                put_cpu();
                break;
        case ARCH_SET_FS:
                /*
                 * Not strictly needed for fs, but do it for symmetry
                 * with gs
                 */
                if (addr >= TASK_SIZE_OF(task))
                        return -EPERM;
                cpu = get_cpu();
                /*
                 * handle small bases via the GDT because that's faster to
                 * switch.
                 */
                if (addr <= 0xffffffff) {
                        set_32bit_tls(task, FS_TLS, addr);
                        if (doit) {
                                load_TLS(&task->thread, cpu);
                                loadsegment(fs, FS_TLS_SEL);
                        }
                        task->thread.fsindex = FS_TLS_SEL;
                        task->thread.fs = 0;
                } else {
                        task->thread.fsindex = 0;
                        task->thread.fs = addr;
                        if (doit) {
                                /*
                                 * set the selector to 0 to not confuse
                                 * __switch_to
                                 */
                                loadsegment(fs, 0);
                                ret = checking_wrmsrl(MSR_FS_BASE, addr);
                        }
                }
                put_cpu();
                break;
        case ARCH_GET_FS: {
                unsigned long base;
                if (task->thread.fsindex == FS_TLS_SEL)
                        base = read_32bit_tls(task, FS_TLS);
                else if (doit)
                        rdmsrl(MSR_FS_BASE, base);
                else
                        base = task->thread.fs;
                ret = put_user(base, (unsigned long __user *)addr);
                break;
        }
        case ARCH_GET_GS: {
                unsigned long base;
                unsigned gsindex;
                if (task->thread.gsindex == GS_TLS_SEL)
                        base = read_32bit_tls(task, GS_TLS);
                else if (doit) {
                        savesegment(gs, gsindex);
                        if (gsindex)
                                rdmsrl(MSR_KERNEL_GS_BASE, base);
                        else
                                base = task->thread.gs;
                } else
                        base = task->thread.gs;
                ret = put_user(base, (unsigned long __user *)addr);
                break;
        }

        default:
                ret = -EINVAL;
                break;
        }

        return ret;
}

long sys_arch_prctl(int code, unsigned long addr)
{
        return do_arch_prctl(current, code, addr);
}
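
/*
 * Userspace reaches do_arch_prctl() through the arch_prctl(2) system call
 * handled just above.  Illustrative usage sketch (not part of the kernel
 * build; glibc provides no wrapper, so callers usually go through
 * syscall(2)):
 *
 *	#include <asm/prctl.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	unsigned long base;
 *	syscall(SYS_arch_prctl, ARCH_SET_GS, 0x1000000UL);
 *	syscall(SYS_arch_prctl, ARCH_GET_FS, (unsigned long)&base);
 *
 * For the GET variants 'addr' is treated as a user pointer and the base is
 * written back with put_user(), which is why an address is passed rather
 * than the base being returned as the syscall's return value.
 */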

unsigned long KSTK_ESP(struct task_struct *task)
{
        return (test_tsk_thread_flag(task, TIF_IA32)) ?
                        (task_pt_regs(task)->sp) : ((task)->thread.usersp);
}