1 /*- 2 * SPDX-License-Identifier: BSD-4-Clause 3 * 4 * Copyright (c) 1982, 1986 The Regents of the University of California. 5 * Copyright (c) 1989, 1990 William Jolitz 6 * Copyright (c) 1994 John Dyson 7 * All rights reserved. 8 * 9 * This code is derived from software contributed to Berkeley by 10 * the Systems Programming Group of the University of Utah Computer 11 * Science Department, and William Jolitz. 12 * 13 * Redistribution and use in source and binary forms, with or without 14 * modification, are permitted provided that the following conditions 15 * are met: 16 * 1. Redistributions of source code must retain the above copyright 17 * notice, this list of conditions and the following disclaimer. 18 * 2. Redistributions in binary form must reproduce the above copyright 19 * notice, this list of conditions and the following disclaimer in the 20 * documentation and/or other materials provided with the distribution. 21 * 3. All advertising materials mentioning features or use of this software 22 * must display the following acknowledgement: 23 * This product includes software developed by the University of 24 * California, Berkeley and its contributors. 25 * 4. Neither the name of the University nor the names of its contributors 26 * may be used to endorse or promote products derived from this software 27 * without specific prior written permission. 28 * 29 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 30 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 31 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 32 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 33 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 34 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 35 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 36 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 37 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 38 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 39 * SUCH DAMAGE. 40 * 41 * from: @(#)vm_machdep.c 7.3 (Berkeley) 5/13/91 42 * Utah $Hdr: vm_machdep.c 1.16.1.1 89/06/23$ 43 */ 44 45 #include <sys/cdefs.h> 46 #include "opt_isa.h" 47 #include "opt_npx.h" 48 #include "opt_reset.h" 49 #include "opt_cpu.h" 50 51 #include <sys/param.h> 52 #include <sys/systm.h> 53 #include <sys/bio.h> 54 #include <sys/buf.h> 55 #include <sys/kernel.h> 56 #include <sys/ktr.h> 57 #include <sys/lock.h> 58 #include <sys/malloc.h> 59 #include <sys/mbuf.h> 60 #include <sys/mutex.h> 61 #include <sys/proc.h> 62 #include <sys/sysent.h> 63 #include <sys/sf_buf.h> 64 #include <sys/smp.h> 65 #include <sys/sched.h> 66 #include <sys/sysctl.h> 67 #include <sys/unistd.h> 68 #include <sys/vnode.h> 69 #include <sys/vmmeter.h> 70 71 #include <machine/cpu.h> 72 #include <machine/cputypes.h> 73 #include <machine/md_var.h> 74 #include <machine/pcb.h> 75 #include <machine/pcb_ext.h> 76 #include <machine/smp.h> 77 #include <machine/vm86.h> 78 79 #include <vm/vm.h> 80 #include <vm/vm_extern.h> 81 #include <vm/vm_kern.h> 82 #include <vm/vm_page.h> 83 #include <vm/vm_map.h> 84 #include <vm/vm_param.h> 85 86 _Static_assert(__OFFSETOF_MONITORBUF == offsetof(struct pcpu, pc_monitorbuf), 87 "__OFFSETOF_MONITORBUF does not correspond with offset of pc_monitorbuf."); 88 89 union savefpu * 90 get_pcb_user_save_td(struct thread *td) 91 { 92 vm_offset_t p; 93 94 p = td->td_kstack + td->td_kstack_pages * PAGE_SIZE - 95 roundup2(cpu_max_ext_state_size, XSAVE_AREA_ALIGN); 96 KASSERT((p % XSAVE_AREA_ALIGN) == 0, ("Unaligned pcb_user_save area")); 97 return ((union savefpu *)p); 98 } 99 100 union savefpu * 101 get_pcb_user_save_pcb(struct pcb *pcb) 102 { 103 vm_offset_t p; 104 105 p = (vm_offset_t)(pcb + 1); 106 return ((union savefpu *)p); 107 } 108 109 struct pcb * 110 get_pcb_td(struct thread *td) 111 { 112 vm_offset_t p; 113 114 p = td->td_kstack + td->td_kstack_pages * PAGE_SIZE - 115 roundup2(cpu_max_ext_state_size, XSAVE_AREA_ALIGN) - 116 sizeof(struct pcb); 117 return ((struct pcb *)p); 118 } 119 120 void * 121 alloc_fpusave(int flags) 122 { 123 void *res; 124 struct savefpu_ymm *sf; 125 126 res = malloc(cpu_max_ext_state_size, M_DEVBUF, flags); 127 if (use_xsave) { 128 sf = (struct savefpu_ymm *)res; 129 bzero(&sf->sv_xstate.sx_hd, sizeof(sf->sv_xstate.sx_hd)); 130 sf->sv_xstate.sx_hd.xstate_bv = xsave_mask; 131 } 132 return (res); 133 } 134 135 /* 136 * Common code shared between cpu_fork() and cpu_copy_thread() for 137 * initializing a thread. 138 */ 139 static void 140 copy_thread(struct thread *td1, struct thread *td2) 141 { 142 struct pcb *pcb2; 143 144 pcb2 = td2->td_pcb; 145 146 /* Ensure that td1's pcb is up to date for user threads. */ 147 if ((td2->td_pflags & TDP_KTHREAD) == 0) { 148 MPASS(td1 == curthread); 149 td1->td_pcb->pcb_gs = rgs(); 150 critical_enter(); 151 if (PCPU_GET(fpcurthread) == td1) 152 npxsave(td1->td_pcb->pcb_save); 153 critical_exit(); 154 } 155 156 /* Copy td1's pcb */ 157 bcopy(td1->td_pcb, pcb2, sizeof(*pcb2)); 158 159 /* Properly initialize pcb_save */ 160 pcb2->pcb_save = get_pcb_user_save_pcb(pcb2); 161 162 /* Kernel threads start with clean NPX and segment bases. */ 163 if ((td2->td_pflags & TDP_KTHREAD) != 0) { 164 pcb2->pcb_gs = _udatasel; 165 set_fsbase(td2, 0); 166 set_gsbase(td2, 0); 167 pcb2->pcb_flags &= ~(PCB_NPXINITDONE | PCB_NPXUSERINITDONE | 168 PCB_KERNNPX | PCB_KERNNPX_THR); 169 } else { 170 MPASS((pcb2->pcb_flags & (PCB_KERNNPX | PCB_KERNNPX_THR)) == 0); 171 bcopy(get_pcb_user_save_td(td1), get_pcb_user_save_pcb(pcb2), 172 cpu_max_ext_state_size); 173 } 174 175 /* 176 * Set registers for trampoline to user mode. Leave space for the 177 * return address on stack. These are the kernel mode register values. 178 */ 179 pcb2->pcb_edi = 0; 180 pcb2->pcb_esi = (int)fork_return; /* trampoline arg */ 181 pcb2->pcb_ebp = 0; 182 pcb2->pcb_esp = (int)td2->td_frame - sizeof(void *); /* trampoline arg */ 183 pcb2->pcb_ebx = (int)td2; /* trampoline arg */ 184 pcb2->pcb_eip = (int)fork_trampoline + setidt_disp; 185 /* 186 * If we didn't copy the pcb, we'd need to do the following registers: 187 * pcb2->pcb_cr3: cloned above. 188 * pcb2->pcb_dr*: cloned above. 189 * pcb2->pcb_savefpu: cloned above. 190 * pcb2->pcb_flags: cloned above. 191 * pcb2->pcb_onfault: cloned above (always NULL here?). 192 * pcb2->pcb_gs: cloned above. 193 * pcb2->pcb_ext: cleared below. 194 */ 195 pcb2->pcb_ext = NULL; 196 197 /* Setup to release spin count in fork_exit(). */ 198 td2->td_md.md_spinlock_count = 1; 199 td2->td_md.md_saved_flags = PSL_KERNEL | PSL_I; 200 } 201 202 /* 203 * Finish a fork operation, with process p2 nearly set up. 204 * Copy and update the pcb, set up the stack so that the child 205 * ready to run and return to user mode. 206 */ 207 void 208 cpu_fork(struct thread *td1, struct proc *p2, struct thread *td2, int flags) 209 { 210 struct proc *p1; 211 struct pcb *pcb2; 212 struct mdproc *mdp2; 213 214 p1 = td1->td_proc; 215 if ((flags & RFPROC) == 0) { 216 if ((flags & RFMEM) == 0) { 217 /* unshare user LDT */ 218 struct mdproc *mdp1 = &p1->p_md; 219 struct proc_ldt *pldt, *pldt1; 220 221 mtx_lock_spin(&dt_lock); 222 if ((pldt1 = mdp1->md_ldt) != NULL && 223 pldt1->ldt_refcnt > 1) { 224 pldt = user_ldt_alloc(mdp1, pldt1->ldt_len); 225 if (pldt == NULL) 226 panic("could not copy LDT"); 227 mdp1->md_ldt = pldt; 228 set_user_ldt(mdp1); 229 user_ldt_deref(pldt1); 230 } else 231 mtx_unlock_spin(&dt_lock); 232 } 233 return; 234 } 235 236 /* Point the pcb to the top of the stack */ 237 pcb2 = get_pcb_td(td2); 238 td2->td_pcb = pcb2; 239 240 copy_thread(td1, td2); 241 242 /* Reset debug registers in the new process */ 243 x86_clear_dbregs(pcb2); 244 245 /* Point mdproc and then copy over td1's contents */ 246 mdp2 = &p2->p_md; 247 bcopy(&p1->p_md, mdp2, sizeof(*mdp2)); 248 249 /* 250 * Copy the trap frame for the return to user mode as if from a 251 * syscall. This copies most of the user mode register values. 252 * The -VM86_STACK_SPACE (-16) is so we can expand the trapframe 253 * if we go to vm86. 254 */ 255 td2->td_frame = (struct trapframe *)((caddr_t)td2->td_pcb - 256 VM86_STACK_SPACE) - 1; 257 bcopy(td1->td_frame, td2->td_frame, sizeof(struct trapframe)); 258 259 /* Set child return values. */ 260 p2->p_sysent->sv_set_fork_retval(td2); 261 262 /* 263 * If the parent process has the trap bit set (i.e. a debugger 264 * had single stepped the process to the system call), we need 265 * to clear the trap flag from the new frame. 266 */ 267 td2->td_frame->tf_eflags &= ~PSL_T; 268 269 /* Set cr3 for the new process. */ 270 pcb2->pcb_cr3 = pmap_get_cr3(vmspace_pmap(p2->p_vmspace)); 271 272 /* 273 * XXX don't copy the i/o pages. this should probably be fixed. 274 */ 275 pcb2->pcb_ext = NULL; 276 277 /* Copy the LDT, if necessary. */ 278 mtx_lock_spin(&dt_lock); 279 if (mdp2->md_ldt != NULL) { 280 if (flags & RFMEM) { 281 mdp2->md_ldt->ldt_refcnt++; 282 } else { 283 mdp2->md_ldt = user_ldt_alloc(mdp2, 284 mdp2->md_ldt->ldt_len); 285 if (mdp2->md_ldt == NULL) 286 panic("could not copy LDT"); 287 } 288 } 289 mtx_unlock_spin(&dt_lock); 290 291 /* 292 * Now, cpu_switch() can schedule the new process. 293 * pcb_esp is loaded pointing to the cpu_switch() stack frame 294 * containing the return address when exiting cpu_switch. 295 * This will normally be to fork_trampoline(), which will have 296 * %ebx loaded with the new proc's pointer. fork_trampoline() 297 * will set up a stack to call fork_return(p, frame); to complete 298 * the return to user-mode. 299 */ 300 } 301 302 void 303 x86_set_fork_retval(struct thread *td) 304 { 305 struct trapframe * frame = td->td_frame; 306 307 frame->tf_eax = 0; /* Child returns zero */ 308 frame->tf_eflags &= ~PSL_C; /* success */ 309 frame->tf_edx = 1; /* System V emulation */ 310 } 311 312 /* 313 * Intercept the return address from a freshly forked process that has NOT 314 * been scheduled yet. 315 * 316 * This is needed to make kernel threads stay in kernel mode. 317 */ 318 void 319 cpu_fork_kthread_handler(struct thread *td, void (*func)(void *), void *arg) 320 { 321 /* 322 * Note that the trap frame follows the args, so the function 323 * is really called like this: func(arg, frame); 324 */ 325 td->td_pcb->pcb_esi = (int) func; /* function */ 326 td->td_pcb->pcb_ebx = (int) arg; /* first arg */ 327 } 328 329 void 330 cpu_exit(struct thread *td) 331 { 332 333 /* 334 * If this process has a custom LDT, release it. Reset pc->pcb_gs 335 * and %gs before we free it in case they refer to an LDT entry. 336 */ 337 mtx_lock_spin(&dt_lock); 338 if (td->td_proc->p_md.md_ldt) { 339 td->td_pcb->pcb_gs = _udatasel; 340 load_gs(_udatasel); 341 user_ldt_free(td); 342 } else 343 mtx_unlock_spin(&dt_lock); 344 } 345 346 void 347 cpu_thread_exit(struct thread *td) 348 { 349 350 critical_enter(); 351 if (td == PCPU_GET(fpcurthread)) 352 npxdrop(); 353 critical_exit(); 354 355 /* Disable any hardware breakpoints. */ 356 if (td->td_pcb->pcb_flags & PCB_DBREGS) { 357 reset_dbregs(); 358 td->td_pcb->pcb_flags &= ~PCB_DBREGS; 359 } 360 } 361 362 void 363 cpu_thread_clean(struct thread *td) 364 { 365 struct pcb *pcb; 366 367 pcb = td->td_pcb; 368 if (pcb->pcb_ext != NULL) { 369 /* if (pcb->pcb_ext->ext_refcount-- == 1) ?? */ 370 /* 371 * XXX do we need to move the TSS off the allocated pages 372 * before freeing them? (not done here) 373 */ 374 pmap_trm_free(pcb->pcb_ext, ctob(IOPAGES + 1)); 375 pcb->pcb_ext = NULL; 376 } 377 } 378 379 void 380 cpu_thread_swapin(struct thread *td) 381 { 382 } 383 384 void 385 cpu_thread_swapout(struct thread *td) 386 { 387 } 388 389 void 390 cpu_thread_alloc(struct thread *td) 391 { 392 struct pcb *pcb; 393 struct xstate_hdr *xhdr; 394 395 td->td_pcb = pcb = get_pcb_td(td); 396 td->td_frame = (struct trapframe *)((caddr_t)pcb - 397 VM86_STACK_SPACE) - 1; 398 pcb->pcb_ext = NULL; 399 pcb->pcb_save = get_pcb_user_save_pcb(pcb); 400 if (use_xsave) { 401 xhdr = (struct xstate_hdr *)(pcb->pcb_save + 1); 402 bzero(xhdr, sizeof(*xhdr)); 403 xhdr->xstate_bv = xsave_mask; 404 } 405 } 406 407 void 408 cpu_thread_free(struct thread *td) 409 { 410 411 cpu_thread_clean(td); 412 } 413 414 bool 415 cpu_exec_vmspace_reuse(struct proc *p __unused, vm_map_t map __unused) 416 { 417 418 return (true); 419 } 420 421 int 422 cpu_procctl(struct thread *td __unused, int idtype __unused, id_t id __unused, 423 int com __unused, void *data __unused) 424 { 425 426 return (EINVAL); 427 } 428 429 void 430 cpu_set_syscall_retval(struct thread *td, int error) 431 { 432 433 switch (error) { 434 case 0: 435 td->td_frame->tf_eax = td->td_retval[0]; 436 td->td_frame->tf_edx = td->td_retval[1]; 437 td->td_frame->tf_eflags &= ~PSL_C; 438 break; 439 440 case ERESTART: 441 /* 442 * Reconstruct pc, assuming lcall $X,y is 7 bytes, int 443 * 0x80 is 2 bytes. We saved this in tf_err. 444 */ 445 td->td_frame->tf_eip -= td->td_frame->tf_err; 446 break; 447 448 case EJUSTRETURN: 449 break; 450 451 default: 452 td->td_frame->tf_eax = error; 453 td->td_frame->tf_eflags |= PSL_C; 454 break; 455 } 456 } 457 458 /* 459 * Initialize machine state, mostly pcb and trap frame for a new 460 * thread, about to return to userspace. Put enough state in the new 461 * thread's PCB to get it to go back to the fork_return(), which 462 * finalizes the thread state and handles peculiarities of the first 463 * return to userspace for the new thread. 464 */ 465 void 466 cpu_copy_thread(struct thread *td, struct thread *td0) 467 { 468 copy_thread(td0, td); 469 470 /* 471 * Copy user general-purpose registers. 472 * 473 * Some of these registers are rewritten by cpu_set_upcall() 474 * and linux_set_upcall(). 475 */ 476 bcopy(td0->td_frame, td->td_frame, sizeof(struct trapframe)); 477 478 /* If the current thread has the trap bit set (i.e. a debugger had 479 * single stepped the process to the system call), we need to clear 480 * the trap flag from the new frame. Otherwise, the new thread will 481 * receive a (likely unexpected) SIGTRAP when it executes the first 482 * instruction after returning to userland. 483 */ 484 td->td_frame->tf_eflags &= ~PSL_T; 485 } 486 487 /* 488 * Set that machine state for performing an upcall that starts 489 * the entry function with the given argument. 490 */ 491 void 492 cpu_set_upcall(struct thread *td, void (*entry)(void *), void *arg, 493 stack_t *stack) 494 { 495 496 /* 497 * Do any extra cleaning that needs to be done. 498 * The thread may have optional components 499 * that are not present in a fresh thread. 500 * This may be a recycled thread so make it look 501 * as though it's newly allocated. 502 */ 503 cpu_thread_clean(td); 504 505 /* 506 * Set the trap frame to point at the beginning of the entry 507 * function. 508 */ 509 td->td_frame->tf_ebp = 0; 510 td->td_frame->tf_esp = 511 (((int)stack->ss_sp + stack->ss_size - 4) & ~0x0f) - 4; 512 td->td_frame->tf_eip = (int)entry; 513 514 /* Return address sentinel value to stop stack unwinding. */ 515 suword((void *)td->td_frame->tf_esp, 0); 516 517 /* Pass the argument to the entry point. */ 518 suword((void *)(td->td_frame->tf_esp + sizeof(void *)), 519 (int)arg); 520 } 521 522 int 523 cpu_set_user_tls(struct thread *td, void *tls_base) 524 { 525 struct segment_descriptor sd; 526 uint32_t base; 527 528 /* 529 * Construct a descriptor and store it in the pcb for 530 * the next context switch. Also store it in the gdt 531 * so that the load of tf_fs into %fs will activate it 532 * at return to userland. 533 */ 534 base = (uint32_t)tls_base; 535 sd.sd_lobase = base & 0xffffff; 536 sd.sd_hibase = (base >> 24) & 0xff; 537 sd.sd_lolimit = 0xffff; /* 4GB limit, wraps around */ 538 sd.sd_hilimit = 0xf; 539 sd.sd_type = SDT_MEMRWA; 540 sd.sd_dpl = SEL_UPL; 541 sd.sd_p = 1; 542 sd.sd_xx = 0; 543 sd.sd_def32 = 1; 544 sd.sd_gran = 1; 545 critical_enter(); 546 /* set %gs */ 547 td->td_pcb->pcb_gsd = sd; 548 if (td == curthread) { 549 PCPU_GET(fsgs_gdt)[1] = sd; 550 load_gs(GSEL(GUGS_SEL, SEL_UPL)); 551 } 552 critical_exit(); 553 return (0); 554 } 555 556 /* 557 * Convert kernel VA to physical address 558 */ 559 vm_paddr_t 560 kvtop(void *addr) 561 { 562 vm_paddr_t pa; 563 564 pa = pmap_kextract((vm_offset_t)addr); 565 if (pa == 0) 566 panic("kvtop: zero page frame"); 567 return (pa); 568 } 569 570 /* 571 * Get an sf_buf from the freelist. May block if none are available. 572 */ 573 void 574 sf_buf_map(struct sf_buf *sf, int flags) 575 { 576 577 pmap_sf_buf_map(sf); 578 #ifdef SMP 579 sf_buf_shootdown(sf, flags); 580 #endif 581 } 582 583 #ifdef SMP 584 static void 585 sf_buf_shootdown_curcpu_cb(pmap_t pmap __unused, 586 vm_offset_t addr1 __unused, vm_offset_t addr2 __unused) 587 { 588 } 589 590 void 591 sf_buf_shootdown(struct sf_buf *sf, int flags) 592 { 593 cpuset_t other_cpus; 594 u_int cpuid; 595 596 sched_pin(); 597 cpuid = PCPU_GET(cpuid); 598 if (!CPU_ISSET(cpuid, &sf->cpumask)) { 599 CPU_SET(cpuid, &sf->cpumask); 600 invlpg(sf->kva); 601 } 602 if ((flags & SFB_CPUPRIVATE) == 0) { 603 other_cpus = all_cpus; 604 CPU_CLR(cpuid, &other_cpus); 605 CPU_ANDNOT(&other_cpus, &other_cpus, &sf->cpumask); 606 if (!CPU_EMPTY(&other_cpus)) { 607 CPU_OR(&sf->cpumask, &sf->cpumask, &other_cpus); 608 smp_masked_invlpg(other_cpus, sf->kva, kernel_pmap, 609 sf_buf_shootdown_curcpu_cb); 610 } 611 } 612 sched_unpin(); 613 } 614 #endif 615 616 /* 617 * MD part of sf_buf_free(). 618 */ 619 int 620 sf_buf_unmap(struct sf_buf *sf) 621 { 622 623 return (0); 624 } 625 626 static void 627 sf_buf_invalidate(struct sf_buf *sf) 628 { 629 vm_page_t m = sf->m; 630 631 /* 632 * Use pmap_qenter to update the pte for 633 * existing mapping, in particular, the PAT 634 * settings are recalculated. 635 */ 636 pmap_qenter(sf->kva, &m, 1); 637 pmap_invalidate_cache_range(sf->kva, sf->kva + PAGE_SIZE); 638 } 639 640 /* 641 * Invalidate the cache lines that may belong to the page, if 642 * (possibly old) mapping of the page by sf buffer exists. Returns 643 * TRUE when mapping was found and cache invalidated. 644 */ 645 boolean_t 646 sf_buf_invalidate_cache(vm_page_t m) 647 { 648 649 return (sf_buf_process_page(m, sf_buf_invalidate)); 650 } 651