1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Copyright (C) 1995 Linus Torvalds 4 * 5 * Pentium III FXSR, SSE support 6 * Gareth Hughes <gareth@valinux.com>, May 2000 7 * 8 * X86-64 port 9 * Andi Kleen. 10 * 11 * CPU hotplug support - ashok.raj@intel.com 12 */ 13 14 /* 15 * This file handles the architecture-dependent parts of process handling.. 16 */ 17 18 #include <linux/cpu.h> 19 #include <linux/errno.h> 20 #include <linux/sched.h> 21 #include <linux/sched/task.h> 22 #include <linux/sched/task_stack.h> 23 #include <linux/fs.h> 24 #include <linux/kernel.h> 25 #include <linux/mm.h> 26 #include <linux/elfcore.h> 27 #include <linux/smp.h> 28 #include <linux/slab.h> 29 #include <linux/user.h> 30 #include <linux/interrupt.h> 31 #include <linux/delay.h> 32 #include <linux/export.h> 33 #include <linux/ptrace.h> 34 #include <linux/notifier.h> 35 #include <linux/kprobes.h> 36 #include <linux/kdebug.h> 37 #include <linux/prctl.h> 38 #include <linux/uaccess.h> 39 #include <linux/io.h> 40 #include <linux/ftrace.h> 41 #include <linux/syscalls.h> 42 43 #include <asm/pgtable.h> 44 #include <asm/processor.h> 45 #include <asm/fpu/internal.h> 46 #include <asm/mmu_context.h> 47 #include <asm/prctl.h> 48 #include <asm/desc.h> 49 #include <asm/proto.h> 50 #include <asm/ia32.h> 51 #include <asm/syscalls.h> 52 #include <asm/debugreg.h> 53 #include <asm/switch_to.h> 54 #include <asm/xen/hypervisor.h> 55 #include <asm/vdso.h> 56 #include <asm/resctrl_sched.h> 57 #include <asm/unistd.h> 58 #include <asm/fsgsbase.h> 59 #ifdef CONFIG_IA32_EMULATION 60 /* Not included via unistd.h */ 61 #include <asm/unistd_32_ia32.h> 62 #endif 63 64 #include "process.h" 65 66 /* Prints also some state that isn't saved in the pt_regs */ 67 void __show_regs(struct pt_regs *regs, enum show_regs_mode mode) 68 { 69 unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs; 70 unsigned long d0, d1, d2, d3, d6, d7; 71 unsigned int fsindex, gsindex; 72 unsigned int ds, es; 73 74 
show_iret_regs(regs); 75 76 if (regs->orig_ax != -1) 77 pr_cont(" ORIG_RAX: %016lx\n", regs->orig_ax); 78 else 79 pr_cont("\n"); 80 81 printk(KERN_DEFAULT "RAX: %016lx RBX: %016lx RCX: %016lx\n", 82 regs->ax, regs->bx, regs->cx); 83 printk(KERN_DEFAULT "RDX: %016lx RSI: %016lx RDI: %016lx\n", 84 regs->dx, regs->si, regs->di); 85 printk(KERN_DEFAULT "RBP: %016lx R08: %016lx R09: %016lx\n", 86 regs->bp, regs->r8, regs->r9); 87 printk(KERN_DEFAULT "R10: %016lx R11: %016lx R12: %016lx\n", 88 regs->r10, regs->r11, regs->r12); 89 printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n", 90 regs->r13, regs->r14, regs->r15); 91 92 if (mode == SHOW_REGS_SHORT) 93 return; 94 95 if (mode == SHOW_REGS_USER) { 96 rdmsrl(MSR_FS_BASE, fs); 97 rdmsrl(MSR_KERNEL_GS_BASE, shadowgs); 98 printk(KERN_DEFAULT "FS: %016lx GS: %016lx\n", 99 fs, shadowgs); 100 return; 101 } 102 103 asm("movl %%ds,%0" : "=r" (ds)); 104 asm("movl %%es,%0" : "=r" (es)); 105 asm("movl %%fs,%0" : "=r" (fsindex)); 106 asm("movl %%gs,%0" : "=r" (gsindex)); 107 108 rdmsrl(MSR_FS_BASE, fs); 109 rdmsrl(MSR_GS_BASE, gs); 110 rdmsrl(MSR_KERNEL_GS_BASE, shadowgs); 111 112 cr0 = read_cr0(); 113 cr2 = read_cr2(); 114 cr3 = __read_cr3(); 115 cr4 = __read_cr4(); 116 117 printk(KERN_DEFAULT "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", 118 fs, fsindex, gs, gsindex, shadowgs); 119 printk(KERN_DEFAULT "CS: %04lx DS: %04x ES: %04x CR0: %016lx\n", regs->cs, ds, 120 es, cr0); 121 printk(KERN_DEFAULT "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, 122 cr4); 123 124 get_debugreg(d0, 0); 125 get_debugreg(d1, 1); 126 get_debugreg(d2, 2); 127 get_debugreg(d3, 3); 128 get_debugreg(d6, 6); 129 get_debugreg(d7, 7); 130 131 /* Only print out debug registers if they are in their non-default state. 
*/ 132 if (!((d0 == 0) && (d1 == 0) && (d2 == 0) && (d3 == 0) && 133 (d6 == DR6_RESERVED) && (d7 == 0x400))) { 134 printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n", 135 d0, d1, d2); 136 printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n", 137 d3, d6, d7); 138 } 139 140 if (boot_cpu_has(X86_FEATURE_OSPKE)) 141 printk(KERN_DEFAULT "PKRU: %08x\n", read_pkru()); 142 } 143 144 void release_thread(struct task_struct *dead_task) 145 { 146 if (dead_task->mm) { 147 #ifdef CONFIG_MODIFY_LDT_SYSCALL 148 if (dead_task->mm->context.ldt) { 149 pr_warn("WARNING: dead process %s still has LDT? <%p/%d>\n", 150 dead_task->comm, 151 dead_task->mm->context.ldt->entries, 152 dead_task->mm->context.ldt->nr_entries); 153 BUG(); 154 } 155 #endif 156 } 157 } 158 159 enum which_selector { 160 FS, 161 GS 162 }; 163 164 /* 165 * Out of line to be protected from kprobes. It is not used on Xen 166 * paravirt. When paravirt support is needed, it needs to be renamed 167 * with native_ prefix. 168 */ 169 static noinline unsigned long __rdgsbase_inactive(void) 170 { 171 unsigned long gsbase; 172 173 lockdep_assert_irqs_disabled(); 174 175 native_swapgs(); 176 gsbase = rdgsbase(); 177 native_swapgs(); 178 179 return gsbase; 180 } 181 NOKPROBE_SYMBOL(__rdgsbase_inactive); 182 183 /* 184 * Out of line to be protected from kprobes. It is not used on Xen 185 * paravirt. When paravirt support is needed, it needs to be renamed 186 * with native_ prefix. 187 */ 188 static noinline void __wrgsbase_inactive(unsigned long gsbase) 189 { 190 lockdep_assert_irqs_disabled(); 191 192 native_swapgs(); 193 wrgsbase(gsbase); 194 native_swapgs(); 195 } 196 NOKPROBE_SYMBOL(__wrgsbase_inactive); 197 198 /* 199 * Saves the FS or GS base for an outgoing thread if FSGSBASE extensions are 200 * not available. The goal is to be reasonably fast on non-FSGSBASE systems. 201 * It's forcibly inlined because it'll generate better code and this function 202 * is hot. 
203 */ 204 static __always_inline void save_base_legacy(struct task_struct *prev_p, 205 unsigned short selector, 206 enum which_selector which) 207 { 208 if (likely(selector == 0)) { 209 /* 210 * On Intel (without X86_BUG_NULL_SEG), the segment base could 211 * be the pre-existing saved base or it could be zero. On AMD 212 * (with X86_BUG_NULL_SEG), the segment base could be almost 213 * anything. 214 * 215 * This branch is very hot (it's hit twice on almost every 216 * context switch between 64-bit programs), and avoiding 217 * the RDMSR helps a lot, so we just assume that whatever 218 * value is already saved is correct. This matches historical 219 * Linux behavior, so it won't break existing applications. 220 * 221 * To avoid leaking state, on non-X86_BUG_NULL_SEG CPUs, if we 222 * report that the base is zero, it needs to actually be zero: 223 * see the corresponding logic in load_seg_legacy. 224 */ 225 } else { 226 /* 227 * If the selector is 1, 2, or 3, then the base is zero on 228 * !X86_BUG_NULL_SEG CPUs and could be anything on 229 * X86_BUG_NULL_SEG CPUs. In the latter case, Linux 230 * has never attempted to preserve the base across context 231 * switches. 232 * 233 * If selector > 3, then it refers to a real segment, and 234 * saving the base isn't necessary. 235 */ 236 if (which == FS) 237 prev_p->thread.fsbase = 0; 238 else 239 prev_p->thread.gsbase = 0; 240 } 241 } 242 243 static __always_inline void save_fsgs(struct task_struct *task) 244 { 245 savesegment(fs, task->thread.fsindex); 246 savesegment(gs, task->thread.gsindex); 247 if (static_cpu_has(X86_FEATURE_FSGSBASE)) { 248 unsigned long flags; 249 250 /* 251 * If FSGSBASE is enabled, we can't make any useful guesses 252 * about the base, and user code expects us to save the current 253 * value. Fortunately, reading the base directly is efficient. 
254 */ 255 task->thread.fsbase = rdfsbase(); 256 local_irq_save(flags); 257 task->thread.gsbase = __rdgsbase_inactive(); 258 local_irq_restore(flags); 259 } else { 260 save_base_legacy(task, task->thread.fsindex, FS); 261 save_base_legacy(task, task->thread.gsindex, GS); 262 } 263 } 264 265 #if IS_ENABLED(CONFIG_KVM) 266 /* 267 * While a process is running,current->thread.fsbase and current->thread.gsbase 268 * may not match the corresponding CPU registers (see save_base_legacy()). KVM 269 * wants an efficient way to save and restore FSBASE and GSBASE. 270 * When FSGSBASE extensions are enabled, this will have to use RD{FS,GS}BASE. 271 */ 272 void save_fsgs_for_kvm(void) 273 { 274 save_fsgs(current); 275 } 276 EXPORT_SYMBOL_GPL(save_fsgs_for_kvm); 277 #endif 278 279 static __always_inline void loadseg(enum which_selector which, 280 unsigned short sel) 281 { 282 if (which == FS) 283 loadsegment(fs, sel); 284 else 285 load_gs_index(sel); 286 } 287 288 static __always_inline void load_seg_legacy(unsigned short prev_index, 289 unsigned long prev_base, 290 unsigned short next_index, 291 unsigned long next_base, 292 enum which_selector which) 293 { 294 if (likely(next_index <= 3)) { 295 /* 296 * The next task is using 64-bit TLS, is not using this 297 * segment at all, or is having fun with arcane CPU features. 298 */ 299 if (next_base == 0) { 300 /* 301 * Nasty case: on AMD CPUs, we need to forcibly zero 302 * the base. 303 */ 304 if (static_cpu_has_bug(X86_BUG_NULL_SEG)) { 305 loadseg(which, __USER_DS); 306 loadseg(which, next_index); 307 } else { 308 /* 309 * We could try to exhaustively detect cases 310 * under which we can skip the segment load, 311 * but there's really only one case that matters 312 * for performance: if both the previous and 313 * next states are fully zeroed, we can skip 314 * the load. 315 * 316 * (This assumes that prev_base == 0 has no 317 * false positives. This is the case on 318 * Intel-style CPUs.) 
319 */ 320 if (likely(prev_index | next_index | prev_base)) 321 loadseg(which, next_index); 322 } 323 } else { 324 if (prev_index != next_index) 325 loadseg(which, next_index); 326 wrmsrl(which == FS ? MSR_FS_BASE : MSR_KERNEL_GS_BASE, 327 next_base); 328 } 329 } else { 330 /* 331 * The next task is using a real segment. Loading the selector 332 * is sufficient. 333 */ 334 loadseg(which, next_index); 335 } 336 } 337 338 static __always_inline void x86_fsgsbase_load(struct thread_struct *prev, 339 struct thread_struct *next) 340 { 341 if (static_cpu_has(X86_FEATURE_FSGSBASE)) { 342 /* Update the FS and GS selectors if they could have changed. */ 343 if (unlikely(prev->fsindex || next->fsindex)) 344 loadseg(FS, next->fsindex); 345 if (unlikely(prev->gsindex || next->gsindex)) 346 loadseg(GS, next->gsindex); 347 348 /* Update the bases. */ 349 wrfsbase(next->fsbase); 350 __wrgsbase_inactive(next->gsbase); 351 } else { 352 load_seg_legacy(prev->fsindex, prev->fsbase, 353 next->fsindex, next->fsbase, FS); 354 load_seg_legacy(prev->gsindex, prev->gsbase, 355 next->gsindex, next->gsbase, GS); 356 } 357 } 358 359 static unsigned long x86_fsgsbase_read_task(struct task_struct *task, 360 unsigned short selector) 361 { 362 unsigned short idx = selector >> 3; 363 unsigned long base; 364 365 if (likely((selector & SEGMENT_TI_MASK) == 0)) { 366 if (unlikely(idx >= GDT_ENTRIES)) 367 return 0; 368 369 /* 370 * There are no user segments in the GDT with nonzero bases 371 * other than the TLS segments. 372 */ 373 if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) 374 return 0; 375 376 idx -= GDT_ENTRY_TLS_MIN; 377 base = get_desc_base(&task->thread.tls_array[idx]); 378 } else { 379 #ifdef CONFIG_MODIFY_LDT_SYSCALL 380 struct ldt_struct *ldt; 381 382 /* 383 * If performance here mattered, we could protect the LDT 384 * with RCU. This is a slow path, though, so we can just 385 * take the mutex. 
386 */ 387 mutex_lock(&task->mm->context.lock); 388 ldt = task->mm->context.ldt; 389 if (unlikely(idx >= ldt->nr_entries)) 390 base = 0; 391 else 392 base = get_desc_base(ldt->entries + idx); 393 mutex_unlock(&task->mm->context.lock); 394 #else 395 base = 0; 396 #endif 397 } 398 399 return base; 400 } 401 402 unsigned long x86_gsbase_read_cpu_inactive(void) 403 { 404 unsigned long gsbase; 405 406 if (static_cpu_has(X86_FEATURE_FSGSBASE)) { 407 unsigned long flags; 408 409 /* Interrupts are disabled here. */ 410 local_irq_save(flags); 411 gsbase = __rdgsbase_inactive(); 412 local_irq_restore(flags); 413 } else { 414 rdmsrl(MSR_KERNEL_GS_BASE, gsbase); 415 } 416 417 return gsbase; 418 } 419 420 void x86_gsbase_write_cpu_inactive(unsigned long gsbase) 421 { 422 if (static_cpu_has(X86_FEATURE_FSGSBASE)) { 423 unsigned long flags; 424 425 /* Interrupts are disabled here. */ 426 local_irq_save(flags); 427 __wrgsbase_inactive(gsbase); 428 local_irq_restore(flags); 429 } else { 430 wrmsrl(MSR_KERNEL_GS_BASE, gsbase); 431 } 432 } 433 434 unsigned long x86_fsbase_read_task(struct task_struct *task) 435 { 436 unsigned long fsbase; 437 438 if (task == current) 439 fsbase = x86_fsbase_read_cpu(); 440 else if (static_cpu_has(X86_FEATURE_FSGSBASE) || 441 (task->thread.fsindex == 0)) 442 fsbase = task->thread.fsbase; 443 else 444 fsbase = x86_fsgsbase_read_task(task, task->thread.fsindex); 445 446 return fsbase; 447 } 448 449 unsigned long x86_gsbase_read_task(struct task_struct *task) 450 { 451 unsigned long gsbase; 452 453 if (task == current) 454 gsbase = x86_gsbase_read_cpu_inactive(); 455 else if (static_cpu_has(X86_FEATURE_FSGSBASE) || 456 (task->thread.gsindex == 0)) 457 gsbase = task->thread.gsbase; 458 else 459 gsbase = x86_fsgsbase_read_task(task, task->thread.gsindex); 460 461 return gsbase; 462 } 463 464 void x86_fsbase_write_task(struct task_struct *task, unsigned long fsbase) 465 { 466 WARN_ON_ONCE(task == current); 467 468 task->thread.fsbase = fsbase; 469 } 470 471 
void x86_gsbase_write_task(struct task_struct *task, unsigned long gsbase)
{
	WARN_ON_ONCE(task == current);

	task->thread.gsbase = gsbase;
}

int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
		unsigned long arg, struct task_struct *p, unsigned long tls)
{
	int err;
	struct pt_regs *childregs;
	struct fork_frame *fork_frame;
	struct inactive_task_frame *frame;
	struct task_struct *me = current;

	childregs = task_pt_regs(p);
	fork_frame = container_of(childregs, struct fork_frame, regs);
	frame = &fork_frame->frame;

	frame->bp = 0;
	frame->ret_addr = (unsigned long) ret_from_fork;
	p->thread.sp = (unsigned long) fork_frame;
	p->thread.io_bitmap_ptr = NULL;

	/* Refresh the parent's saved FS/GS state before copying it. */
	save_fsgs(me);
	p->thread.fsindex = me->thread.fsindex;
	p->thread.fsbase = me->thread.fsbase;
	p->thread.gsindex = me->thread.gsindex;
	p->thread.gsbase = me->thread.gsbase;
	savesegment(es, p->thread.es);
	savesegment(ds, p->thread.ds);
	memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));

	if (unlikely(p->flags & PF_KTHREAD)) {
		/* kernel thread */
		memset(childregs, 0, sizeof(struct pt_regs));
		frame->bx = sp;		/* function */
		frame->r12 = arg;
		return 0;
	}
	frame->bx = 0;
	*childregs = *current_pt_regs();

	childregs->ax = 0;
	if (sp)
		childregs->sp = sp;

	err = -ENOMEM;
	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
		p->thread.io_bitmap_ptr = kmemdup(me->thread.io_bitmap_ptr,
						  IO_BITMAP_BYTES, GFP_KERNEL);
		if (!p->thread.io_bitmap_ptr) {
			p->thread.io_bitmap_max = 0;
			return -ENOMEM;
		}
		set_tsk_thread_flag(p, TIF_IO_BITMAP);
	}

	/*
	 * Set a new TLS for the child thread?
	 */
	if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
		if (in_ia32_syscall())
			err = do_set_thread_area(p, -1,
				(struct user_desc __user *)tls, 0);
		else
#endif
			err = do_arch_prctl_64(p, ARCH_SET_FS, tls);
		if (err)
			goto out;
	}
	err = 0;
out:
	/* On error, release the copied I/O bitmap before returning. */
	if (err && p->thread.io_bitmap_ptr) {
		kfree(p->thread.io_bitmap_ptr);
		p->thread.io_bitmap_max = 0;
	}

	return err;
}

static void
start_thread_common(struct pt_regs *regs, unsigned long new_ip,
		    unsigned long new_sp,
		    unsigned int _cs, unsigned int _ss, unsigned int _ds)
{
	WARN_ON_ONCE(regs != current_pt_regs());

	if (static_cpu_has(X86_BUG_NULL_SEG)) {
		/* Loading zero below won't clear the base. */
		loadsegment(fs, __USER_DS);
		load_gs_index(__USER_DS);
	}

	loadsegment(fs, 0);
	loadsegment(es, _ds);
	loadsegment(ds, _ds);
	load_gs_index(0);

	regs->ip		= new_ip;
	regs->sp		= new_sp;
	regs->cs		= _cs;
	regs->ss		= _ss;
	regs->flags		= X86_EFLAGS_IF;
	force_iret();
}

void
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
	start_thread_common(regs, new_ip, new_sp,
			    __USER_CS, __USER_DS, 0);
}
EXPORT_SYMBOL_GPL(start_thread);

#ifdef CONFIG_COMPAT
void compat_start_thread(struct pt_regs *regs, u32 new_ip, u32 new_sp)
{
	start_thread_common(regs, new_ip, new_sp,
			    test_thread_flag(TIF_X32)
			    ? __USER_CS : __USER32_CS,
			    __USER_DS, __USER_DS);
}
#endif

/*
 *	switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes not supported here. Set the probe on schedule instead.
 * Function graph tracer not supported too.
 */
__visible __notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
	struct thread_struct *prev = &prev_p->thread;
	struct thread_struct *next = &next_p->thread;
	struct fpu *prev_fpu = &prev->fpu;
	struct fpu *next_fpu = &next->fpu;
	int cpu = smp_processor_id();

	WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) &&
		     this_cpu_read(irq_count) != -1);

	if (!test_thread_flag(TIF_NEED_FPU_LOAD))
		switch_fpu_prepare(prev_fpu, cpu);

	/* We must save %fs and %gs before load_TLS() because
	 * %fs and %gs may be cleared by load_TLS().
	 *
	 * (e.g. xen_load_tls())
	 */
	save_fsgs(prev_p);

	/*
	 * Load TLS before restoring any segments so that segment loads
	 * reference the correct GDT entries.
	 */
	load_TLS(next, cpu);

	/*
	 * Leave lazy mode, flushing any hypercalls made here.  This
	 * must be done after loading TLS entries in the GDT but before
	 * loading segments that might reference them.
	 */
	arch_end_context_switch(next_p);

	/* Switch DS and ES.
	 *
	 * Reading them only returns the selectors, but writing them (if
	 * nonzero) loads the full descriptor from the GDT or LDT.  The
	 * LDT for next is loaded in switch_mm, and the GDT is loaded
	 * above.
	 *
	 * We therefore need to write new values to the segment
	 * registers on every context switch unless both the new and old
	 * values are zero.
	 *
	 * Note that we don't need to do anything for CS and SS, as
	 * those are saved and restored as part of pt_regs.
	 */
	savesegment(es, prev->es);
	if (unlikely(next->es | prev->es))
		loadsegment(es, next->es);

	savesegment(ds, prev->ds);
	if (unlikely(next->ds | prev->ds))
		loadsegment(ds, next->ds);

	x86_fsgsbase_load(prev, next);

	/*
	 * Switch the PDA and FPU contexts.
	 */
	this_cpu_write(current_task, next_p);
	this_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p));

	switch_fpu_finish(next_fpu);

	/* Reload sp0. */
	update_task_stack(next_p);

	switch_to_extra(prev_p, next_p);

#ifdef CONFIG_XEN_PV
	/*
	 * On Xen PV, IOPL bits in pt_regs->flags have no effect, and
	 * current_pt_regs()->flags may not match the current task's
	 * intended IOPL.  We need to switch it manually.
	 */
	if (unlikely(static_cpu_has(X86_FEATURE_XENPV) &&
		     prev->iopl != next->iopl))
		xen_set_iopl_mask(next->iopl);
#endif

	if (static_cpu_has_bug(X86_BUG_SYSRET_SS_ATTRS)) {
		/*
		 * AMD CPUs have a misfeature: SYSRET sets the SS selector but
		 * does not update the cached descriptor.  As a result, if we
		 * do SYSRET while SS is NULL, we'll end up in user mode with
		 * SS apparently equal to __USER_DS but actually unusable.
		 *
		 * The straightforward workaround would be to fix it up just
		 * before SYSRET, but that would slow down the system call
		 * fast paths.  Instead, we ensure that SS is never NULL in
		 * system call context.  We do this by replacing NULL SS
		 * selectors at every context switch.  SYSCALL sets up a valid
		 * SS, so the only way to get NULL is to re-enter the kernel
		 * from CPL 3 through an interrupt.  Since that can't happen
		 * in the same task as a running syscall, we are guaranteed to
		 * context switch between every interrupt vector entry and a
		 * subsequent SYSRET.
		 *
		 * We read SS first because SS reads are much faster than
		 * writes.  Out of caution, we force SS to __KERNEL_DS even if
		 * it previously had a different non-NULL value.
		 */
		unsigned short ss_sel;
		savesegment(ss, ss_sel);
		if (ss_sel != __KERNEL_DS)
			loadsegment(ss, __KERNEL_DS);
	}

	/* Load the Intel cache allocation PQR MSR. */
	resctrl_sched_in();

	return prev_p;
}

void set_personality_64bit(void)
{
	/* inherit personality from parent */

	/* Make sure to be in 64bit mode */
	clear_thread_flag(TIF_IA32);
	clear_thread_flag(TIF_ADDR32);
	clear_thread_flag(TIF_X32);
	/* Pretend that this comes from a 64bit execve */
	task_pt_regs(current)->orig_ax = __NR_execve;
	current_thread_info()->status &= ~TS_COMPAT;

	/* Ensure the corresponding mm is not marked. */
	if (current->mm)
		current->mm->context.ia32_compat = 0;

	/* TBD: overwrites user setup. Should have two bits.
	   But 64bit processes have always behaved this way,
	   so it's not too bad. The main problem is just that
	   32bit children are affected again. */
	current->personality &= ~READ_IMPLIES_EXEC;
}

static void __set_personality_x32(void)
{
#ifdef CONFIG_X86_X32
	clear_thread_flag(TIF_IA32);
	set_thread_flag(TIF_X32);
	if (current->mm)
		current->mm->context.ia32_compat = TIF_X32;
	current->personality &= ~READ_IMPLIES_EXEC;
	/*
	 * in_32bit_syscall() uses the presence of the x32 syscall bit
	 * flag to determine compat status.  The x86 mmap() code relies on
	 * the syscall bitness so set x32 syscall bit right here to make
	 * in_32bit_syscall() work during exec().
	 *
	 * Pretend to come from a x32 execve.
	 */
	task_pt_regs(current)->orig_ax = __NR_x32_execve | __X32_SYSCALL_BIT;
	current_thread_info()->status &= ~TS_COMPAT;
#endif
}

static void __set_personality_ia32(void)
{
#ifdef CONFIG_IA32_EMULATION
	set_thread_flag(TIF_IA32);
	clear_thread_flag(TIF_X32);
	if (current->mm)
		current->mm->context.ia32_compat = TIF_IA32;
	current->personality |= force_personality32;
	/* Prepare the first "return" to user space */
	task_pt_regs(current)->orig_ax = __NR_ia32_execve;
	current_thread_info()->status |= TS_COMPAT;
#endif
}

void set_personality_ia32(bool x32)
{
	/* Make sure to be in 32bit mode */
	set_thread_flag(TIF_ADDR32);

	if (x32)
		__set_personality_x32();
	else
		__set_personality_ia32();
}
EXPORT_SYMBOL_GPL(set_personality_ia32);

#ifdef CONFIG_CHECKPOINT_RESTORE
static long prctl_map_vdso(const struct vdso_image *image, unsigned long addr)
{
	int ret;

	ret = map_vdso_once(image, addr);
	if (ret)
		return ret;

	/* On success, report the mapped image's size to the caller. */
	return (long)image->size;
}
#endif

long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2)
{
	int ret = 0;

	switch (option) {
	case ARCH_SET_GS: {
		if (unlikely(arg2 >= TASK_SIZE_MAX))
			return -EPERM;

		preempt_disable();
		/*
		 * ARCH_SET_GS has always overwritten the index
		 * and the base.  Zero is the most sensible value
		 * to put in the index, and is the only value that
		 * makes any sense if FSGSBASE is unavailable.
		 */
		if (task == current) {
			loadseg(GS, 0);
			x86_gsbase_write_cpu_inactive(arg2);

			/*
			 * On non-FSGSBASE systems, save_base_legacy() expects
			 * that we also fill in thread.gsbase.
			 */
			task->thread.gsbase = arg2;

		} else {
			task->thread.gsindex = 0;
			x86_gsbase_write_task(task, arg2);
		}
		preempt_enable();
		break;
	}
	case ARCH_SET_FS: {
		/*
		 * Not strictly needed for %fs, but do it for symmetry
		 * with %gs
		 */
		if (unlikely(arg2 >= TASK_SIZE_MAX))
			return -EPERM;

		preempt_disable();
		/*
		 * Set the selector to 0 for the same reason
		 * as %gs above.
		 */
		if (task == current) {
			loadseg(FS, 0);
			x86_fsbase_write_cpu(arg2);

			/*
			 * On non-FSGSBASE systems, save_base_legacy() expects
			 * that we also fill in thread.fsbase.
			 */
			task->thread.fsbase = arg2;
		} else {
			task->thread.fsindex = 0;
			x86_fsbase_write_task(task, arg2);
		}
		preempt_enable();
		break;
	}
	case ARCH_GET_FS: {
		unsigned long base = x86_fsbase_read_task(task);

		ret = put_user(base, (unsigned long __user *)arg2);
		break;
	}
	case ARCH_GET_GS: {
		unsigned long base = x86_gsbase_read_task(task);

		ret = put_user(base, (unsigned long __user *)arg2);
		break;
	}

#ifdef CONFIG_CHECKPOINT_RESTORE
# ifdef CONFIG_X86_X32_ABI
	case ARCH_MAP_VDSO_X32:
		return prctl_map_vdso(&vdso_image_x32, arg2);
# endif
# if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
	case ARCH_MAP_VDSO_32:
		return prctl_map_vdso(&vdso_image_32, arg2);
# endif
	case ARCH_MAP_VDSO_64:
		return prctl_map_vdso(&vdso_image_64, arg2);
#endif

	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}

SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)
{
	long ret;

	/* Try the 64-bit-only options first, then the common ones. */
	ret = do_arch_prctl_64(current, option, arg2);
	if (ret == -EINVAL)
		ret = do_arch_prctl_common(current, option, arg2);

	return ret;
}

#ifdef CONFIG_IA32_EMULATION
COMPAT_SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)
{
	return do_arch_prctl_common(current, option, arg2);
}
#endif

/* Return the task's saved user stack pointer from its pt_regs. */
unsigned long KSTK_ESP(struct task_struct *task)
{
	return task_pt_regs(task)->sp;
}