1 /*- 2 * SPDX-License-Identifier: BSD-4-Clause 3 * 4 * Copyright (c) 2003 Peter Wemm. 5 * Copyright (c) 1992 Terrence R. Lambert. 6 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California. 7 * All rights reserved. 8 * 9 * This code is derived from software contributed to Berkeley by 10 * William Jolitz. 11 * 12 * Redistribution and use in source and binary forms, with or without 13 * modification, are permitted provided that the following conditions 14 * are met: 15 * 1. Redistributions of source code must retain the above copyright 16 * notice, this list of conditions and the following disclaimer. 17 * 2. Redistributions in binary form must reproduce the above copyright 18 * notice, this list of conditions and the following disclaimer in the 19 * documentation and/or other materials provided with the distribution. 20 * 3. All advertising materials mentioning features or use of this software 21 * must display the following acknowledgement: 22 * This product includes software developed by the University of 23 * California, Berkeley and its contributors. 24 * 4. Neither the name of the University nor the names of its contributors 25 * may be used to endorse or promote products derived from this software 26 * without specific prior written permission. 27 * 28 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 29 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 30 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 31 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 32 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 33 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 34 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 35 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 37 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 38 * SUCH DAMAGE. 39 */ 40 41 #include "opt_atpic.h" 42 #include "opt_cpu.h" 43 #include "opt_ddb.h" 44 #include "opt_inet.h" 45 #include "opt_isa.h" 46 #include "opt_kstack_pages.h" 47 #include "opt_maxmem.h" 48 #include "opt_pci.h" 49 #include "opt_platform.h" 50 #include "opt_sched.h" 51 52 #include <sys/param.h> 53 #include <sys/proc.h> 54 #include <sys/systm.h> 55 #include <sys/asan.h> 56 #include <sys/bio.h> 57 #include <sys/buf.h> 58 #include <sys/bus.h> 59 #include <sys/callout.h> 60 #include <sys/cons.h> 61 #include <sys/cpu.h> 62 #include <sys/csan.h> 63 #include <sys/efi.h> 64 #include <sys/eventhandler.h> 65 #include <sys/exec.h> 66 #include <sys/imgact.h> 67 #include <sys/kdb.h> 68 #include <sys/kernel.h> 69 #include <sys/ktr.h> 70 #include <sys/linker.h> 71 #include <sys/lock.h> 72 #include <sys/malloc.h> 73 #include <sys/memrange.h> 74 #include <sys/msan.h> 75 #include <sys/msgbuf.h> 76 #include <sys/mutex.h> 77 #include <sys/pcpu.h> 78 #include <sys/ptrace.h> 79 #include <sys/reboot.h> 80 #include <sys/reg.h> 81 #include <sys/rwlock.h> 82 #include <sys/sched.h> 83 #include <sys/signalvar.h> 84 #include <sys/smp.h> 85 #include <sys/syscallsubr.h> 86 #include <sys/sysctl.h> 87 #include <sys/sysent.h> 88 #include <sys/sysproto.h> 89 #include <sys/ucontext.h> 90 #include <sys/vmmeter.h> 91 92 #include <vm/vm.h> 93 #include <vm/vm_param.h> 94 #include <vm/vm_extern.h> 95 #include <vm/vm_kern.h> 96 #include <vm/vm_page.h> 97 #include 
<vm/vm_map.h> 98 #include <vm/vm_object.h> 99 #include <vm/vm_pager.h> 100 #include <vm/vm_phys.h> 101 #include <vm/vm_dumpset.h> 102 103 #ifdef DDB 104 #ifndef KDB 105 #error KDB must be enabled in order for DDB to work! 106 #endif 107 #include <ddb/ddb.h> 108 #include <ddb/db_sym.h> 109 #endif 110 111 #include <net/netisr.h> 112 113 #include <dev/smbios/smbios.h> 114 115 #include <machine/clock.h> 116 #include <machine/cpu.h> 117 #include <machine/cputypes.h> 118 #include <machine/frame.h> 119 #include <machine/intr_machdep.h> 120 #include <x86/mca.h> 121 #include <machine/md_var.h> 122 #include <machine/metadata.h> 123 #include <machine/pc/bios.h> 124 #include <machine/pcb.h> 125 #include <machine/proc.h> 126 #include <machine/sigframe.h> 127 #include <machine/specialreg.h> 128 #include <machine/trap.h> 129 #include <machine/tss.h> 130 #include <x86/ucode.h> 131 #include <x86/ifunc.h> 132 #include <machine/smp.h> 133 #ifdef FDT 134 #include <x86/fdt.h> 135 #endif 136 137 #ifdef DEV_ATPIC 138 #include <x86/isa/icu.h> 139 #else 140 #include <x86/apicvar.h> 141 #endif 142 143 #include <isa/isareg.h> 144 #include <isa/rtc.h> 145 #include <x86/init.h> 146 147 #ifndef SMP 148 #error amd64 requires options SMP 149 #endif 150 151 /* Sanity check for __curthread() */ 152 CTASSERT(offsetof(struct pcpu, pc_curthread) == 0); 153 154 /* 155 * The PTI trampoline stack needs enough space for a hardware trapframe and a 156 * couple of scratch registers, as well as the trapframe left behind after an 157 * iret fault. 158 */ 159 CTASSERT(PC_PTI_STACK_SZ * sizeof(register_t) >= 2 * sizeof(struct pti_frame) - 160 offsetof(struct pti_frame, pti_rip)); 161 162 extern u_int64_t hammer_time(u_int64_t, u_int64_t); 163 164 static void cpu_startup(void *); 165 SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL); 166 167 /* Probe 8254 PIT and TSC. 
*/ 168 static void native_clock_source_init(void); 169 170 /* Preload data parse function */ 171 static void native_parse_preload_data(u_int64_t); 172 173 /* Native function to fetch and parse the e820 map */ 174 static void native_parse_memmap(vm_paddr_t *, int *); 175 176 /* Default init_ops implementation. */ 177 struct init_ops init_ops = { 178 .parse_preload_data = native_parse_preload_data, 179 .early_clock_source_init = native_clock_source_init, 180 .early_delay = i8254_delay, 181 .parse_memmap = native_parse_memmap, 182 }; 183 184 /* 185 * Physical address of the EFI System Table. Stashed from the metadata hints 186 * passed into the kernel and used by the EFI code to call runtime services. 187 */ 188 vm_paddr_t efi_systbl_phys; 189 190 /* 191 * Bitmap of extra EFI memory region types that should be preserved and mapped 192 * during runtime services calls. 193 */ 194 uint32_t efi_map_regs; 195 196 /* Intel ICH registers */ 197 #define ICH_PMBASE 0x400 198 #define ICH_SMI_EN ICH_PMBASE + 0x30 199 200 int _udatasel, _ucodesel, _ucode32sel, _ufssel, _ugssel; 201 202 int cold = 1; 203 204 long Maxmem = 0; 205 long realmem = 0; 206 int late_console = 1; 207 int lass_enabled = 0; 208 209 struct kva_md_info kmi; 210 211 struct region_descriptor r_idt; 212 213 struct pcpu *__pcpu; 214 struct pcpu temp_bsp_pcpu; 215 216 struct mtx icu_lock; 217 218 struct mem_range_softc mem_range_softc; 219 220 struct mtx dt_lock; /* lock for GDT and LDT */ 221 222 void (*vmm_suspend_p)(void); 223 void (*vmm_resume_p)(void); 224 225 bool efi_boot; 226 227 static void 228 cpu_startup(void *dummy) 229 { 230 uintmax_t memsize; 231 char *sysenv; 232 233 /* 234 * On MacBooks, we need to disallow the legacy USB circuit to 235 * generate an SMI# because this can cause several problems, 236 * namely: incorrect CPU frequency detection and failure to 237 * start the APs. 
238 * We do this by disabling a bit in the SMI_EN (SMI Control and 239 * Enable register) of the Intel ICH LPC Interface Bridge. 240 */ 241 sysenv = kern_getenv("smbios.system.product"); 242 if (sysenv != NULL) { 243 if (strncmp(sysenv, "MacBook1,1", 10) == 0 || 244 strncmp(sysenv, "MacBook3,1", 10) == 0 || 245 strncmp(sysenv, "MacBook4,1", 10) == 0 || 246 strncmp(sysenv, "MacBookPro1,1", 13) == 0 || 247 strncmp(sysenv, "MacBookPro1,2", 13) == 0 || 248 strncmp(sysenv, "MacBookPro3,1", 13) == 0 || 249 strncmp(sysenv, "MacBookPro4,1", 13) == 0 || 250 strncmp(sysenv, "Macmini1,1", 10) == 0) { 251 if (bootverbose) 252 printf("Disabling LEGACY_USB_EN bit on " 253 "Intel ICH.\n"); 254 outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8); 255 } 256 freeenv(sysenv); 257 } 258 259 /* 260 * Good {morning,afternoon,evening,night}. 261 */ 262 startrtclock(); 263 printcpuinfo(); 264 265 /* 266 * Display physical memory if SMBIOS reports reasonable amount. 267 */ 268 memsize = 0; 269 sysenv = kern_getenv("smbios.memory.enabled"); 270 if (sysenv != NULL) { 271 memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10; 272 freeenv(sysenv); 273 } 274 if (memsize < ptoa((uintmax_t)vm_free_count())) 275 memsize = ptoa((uintmax_t)Maxmem); 276 printf("real memory = %ju (%ju MB)\n", memsize, memsize >> 20); 277 realmem = atop(memsize); 278 279 /* 280 * Display any holes after the first chunk of extended memory. 
281 */ 282 if (bootverbose) { 283 int indx; 284 285 printf("Physical memory chunk(s):\n"); 286 for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) { 287 vm_paddr_t size; 288 289 size = phys_avail[indx + 1] - phys_avail[indx]; 290 printf( 291 "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n", 292 (uintmax_t)phys_avail[indx], 293 (uintmax_t)phys_avail[indx + 1] - 1, 294 (uintmax_t)size, (uintmax_t)size / PAGE_SIZE); 295 } 296 } 297 298 vm_ksubmap_init(&kmi); 299 300 printf("avail memory = %ju (%ju MB)\n", 301 ptoa((uintmax_t)vm_free_count()), 302 ptoa((uintmax_t)vm_free_count()) / 1048576); 303 #ifdef DEV_PCI 304 if (bootverbose && intel_graphics_stolen_base != 0) 305 printf("intel stolen mem: base %#jx size %ju MB\n", 306 (uintmax_t)intel_graphics_stolen_base, 307 (uintmax_t)intel_graphics_stolen_size / 1024 / 1024); 308 #endif 309 310 /* 311 * Set up buffers, so they can be used to read disk labels. 312 */ 313 bufinit(); 314 vm_pager_bufferinit(); 315 316 cpu_setregs(); 317 } 318 319 static void 320 late_ifunc_resolve(void *dummy __unused) 321 { 322 link_elf_late_ireloc(); 323 } 324 SYSINIT(late_ifunc_resolve, SI_SUB_CPU, SI_ORDER_ANY, late_ifunc_resolve, NULL); 325 326 void 327 cpu_setregs(void) 328 { 329 register_t cr0; 330 331 TSENTER(); 332 cr0 = rcr0(); 333 cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM; 334 TSENTER2("load_cr0"); 335 load_cr0(cr0); 336 TSEXIT2("load_cr0"); 337 TSEXIT(); 338 } 339 340 /* 341 * Initialize amd64 and configure to run kernel 342 */ 343 344 /* 345 * Initialize segments & interrupt table 346 */ 347 static struct gate_descriptor idt0[NIDT]; 348 struct gate_descriptor *idt = &idt0[0]; /* interrupt descriptor table */ 349 350 static char dblfault_stack[DBLFAULT_STACK_SIZE] __aligned(16); 351 static char mce0_stack[MCE_STACK_SIZE] __aligned(16); 352 static char nmi0_stack[NMI_STACK_SIZE] __aligned(16); 353 static char dbg0_stack[DBG_STACK_SIZE] __aligned(16); 354 CTASSERT(sizeof(struct nmi_pcpu) == 16); 355 356 /* 357 * Software 
prototypes -- in more palatable form.
 *
 * Keep GUFS32, GUGS32, GUCODE32 and GUDATA at the same
 * slots as corresponding segments for i386 kernel.
 *
 * The table is indexed by the G*_SEL selector constants; the static
 * assertion after it checks that it has exactly NGDT entries.
 */
struct soft_segment_descriptor gdt_segs[] = {
[GNULL_SEL] = {	/* 0 Null Descriptor */
	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
[GNULL2_SEL] = {	/*	1 Null Descriptor */
	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
[GUFS32_SEL] = {	/* 2 32 bit %fs Descriptor for user */
	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
[GUGS32_SEL] = {	/* 3 32 bit %gs Descriptor for user */
	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
[GCODE_SEL] = {	/* 4 Code Descriptor for kernel */
	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_long = 1,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
[GDATA_SEL] = {	/* 5 Data Descriptor for kernel */
	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_long = 1,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
[GUCODE32_SEL] = {	/* 6 32 bit Code Descriptor for user */
	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
[GUDATA_SEL] = {	/* 7 32/64 bit Data Descriptor for user */
	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
[GUCODE_SEL] = {	/* 8 64 bit Code Descriptor for user */
	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 1,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
[GPROC0_SEL] = {	/* 9 Proc 0 TSS Descriptor */
	.ssd_base = 0x0,
	.ssd_limit = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE - 1,
	.ssd_type = SDT_SYSTSS,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
[GPROC0_SEL + 1] = {	/* 10 Proc 0 TSS descriptor, double size */
	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
[GUSERLDT_SEL] = {	/* 11 LDT Descriptor */
	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
[GUSERLDT_SEL + 1] = {	/* 12 LDT Descriptor, double size */
	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
};
_Static_assert(nitems(gdt_segs) == NGDT, "Stale NGDT");

/*
 * Fill in IDT slot 'idx' as a present gate for handler 'func'.
 *
 * idx	index into the idt[] array
 * func	handler entry point; its address is split across gd_looffset
 *	and gd_hioffset (the latter takes the address shifted right by 16)
 * typ	gate type stored in gd_type
 * dpl	descriptor privilege level stored in gd_dpl
 * ist	interrupt stack table index stored in gd_ist
 *
 * The gate always uses the kernel code selector GSEL(GCODE_SEL, SEL_KPL).
 * No bounds check is made on 'idx'.
 */
void
setidt(int idx, inthand_t *func, int typ, int dpl, int ist)
{
	struct gate_descriptor *ip;

	ip = idt + idx;
	ip->gd_looffset = (uintptr_t)func;
	ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
	ip->gd_ist = ist;
	ip->gd_xx = 0;
	ip->gd_type = typ;
	ip->gd_dpl = dpl;
	ip->gd_p = 1;
	ip->gd_hioffset = ((uintptr_t)func)>>16 ;
}

/* Exception, trap and system call entry points, defined outside this file. */
extern inthand_t
	IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
	IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
	IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
	IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
	IDTVEC(xmm), IDTVEC(dblfault),
	IDTVEC(div_pti), IDTVEC(bpt_pti),
	IDTVEC(ofl_pti), IDTVEC(bnd_pti), IDTVEC(ill_pti), IDTVEC(dna_pti),
	IDTVEC(fpusegm_pti), IDTVEC(tss_pti), IDTVEC(missing_pti),
	IDTVEC(stk_pti), IDTVEC(prot_pti), IDTVEC(page_pti),
	IDTVEC(rsvd_pti), IDTVEC(fpu_pti), IDTVEC(align_pti),
	IDTVEC(xmm_pti),
#ifdef KDTRACE_HOOKS
	IDTVEC(dtrace_ret), IDTVEC(dtrace_ret_pti),
#endif
#ifdef XENHVM
	IDTVEC(xen_intr_upcall), IDTVEC(xen_intr_upcall_pti),
#endif
	IDTVEC(fast_syscall), IDTVEC(fast_syscall32),
	IDTVEC(fast_syscall_pti);

#ifdef DDB
/*
 * Display the index and function name of any IDT entries that don't use
 * the default 'rsvd' entry point.
 *
 * The handler address is rebuilt from gd_hioffset and gd_looffset, the
 * reverse of the split done in setidt().  The walk stops early when the
 * DDB pager asks to quit.
 */
DB_SHOW_COMMAND_FLAGS(idt, db_show_idt, DB_CMD_MEMSAFE)
{
	struct gate_descriptor *ip;
	int idx;
	uintptr_t func;

	ip = idt;
	for (idx = 0; idx < NIDT && !db_pager_quit; idx++) {
		func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset);
		if (func != (uintptr_t)&IDTVEC(rsvd)) {
			db_printf("%3d\t", idx);
			db_printsym(func, DB_STGY_PROC);
			db_printf("\n");
		}
		ip++;
	}
}

/* Show privileged registers.
*/
/*
 * DDB "show sysregs": prints IDTR and GDTR (base/limit), LDTR, TR,
 * %cr0/%cr2/%cr3/%cr4, XCR0 when CR4_XSAVE is set, and the EFER,
 * FEATURE_CONTROL (only when the CPU advertises VMX or SMX), DEBUGCTL,
 * PAT and GSBASE MSRs.
 */
DB_SHOW_COMMAND_FLAGS(sysregs, db_show_sysregs, DB_CMD_MEMSAFE)
{
	/* Memory image written by sidt/sgdt: 16-bit limit, then 64-bit base. */
	struct {
		uint16_t limit;
		uint64_t base;
	} __packed idtr, gdtr;
	uint16_t ldt, tr;

	__asm __volatile("sidt %0" : "=m" (idtr));
	db_printf("idtr\t0x%016lx/%04x\n",
	    (u_long)idtr.base, (u_int)idtr.limit);
	__asm __volatile("sgdt %0" : "=m" (gdtr));
	db_printf("gdtr\t0x%016lx/%04x\n",
	    (u_long)gdtr.base, (u_int)gdtr.limit);
	__asm __volatile("sldt %0" : "=r" (ldt));
	db_printf("ldtr\t0x%04x\n", ldt);
	__asm __volatile("str %0" : "=r" (tr));
	db_printf("tr\t0x%04x\n", tr);
	db_printf("cr0\t0x%016lx\n", rcr0());
	db_printf("cr2\t0x%016lx\n", rcr2());
	db_printf("cr3\t0x%016lx\n", rcr3());
	db_printf("cr4\t0x%016lx\n", rcr4());
	/* XCR0 is only read when CR4 reports XSAVE as enabled. */
	if (rcr4() & CR4_XSAVE)
		db_printf("xcr0\t0x%016lx\n", rxcr(0));
	db_printf("EFER\t0x%016lx\n", rdmsr(MSR_EFER));
	/* The feature-control MSR is only read on CPUs with VMX or SMX. */
	if (cpu_feature2 & (CPUID2_VMX | CPUID2_SMX))
		db_printf("FEATURES_CTL\t%016lx\n",
		    rdmsr(MSR_IA32_FEATURE_CONTROL));
	db_printf("DEBUG_CTL\t0x%016lx\n", rdmsr(MSR_DEBUGCTLMSR));
	db_printf("PAT\t0x%016lx\n", rdmsr(MSR_PAT));
	db_printf("GSBASE\t0x%016lx\n", rdmsr(MSR_GSBASE));
}

/* DDB "show dbregs": prints debug registers %dr0-%dr3, %dr6 and %dr7. */
DB_SHOW_COMMAND_FLAGS(dbregs, db_show_dbregs, DB_CMD_MEMSAFE)
{

	db_printf("dr0\t0x%016lx\n", rdr0());
	db_printf("dr1\t0x%016lx\n", rdr1());
	db_printf("dr2\t0x%016lx\n", rdr2());
	db_printf("dr3\t0x%016lx\n", rdr3());
	db_printf("dr6\t0x%016lx\n", rdr6());
	db_printf("dr7\t0x%016lx\n", rdr7());
}
#endif

/*
 * Convert a user segment descriptor 'sd' into its software form 'ssd'.
 *
 * The split base (sd_lobase, plus sd_hibase shifted up by 24) and the
 * split limit (sd_lolimit, plus sd_hilimit shifted up by 16) are joined
 * back into single values; the type, dpl, present, long, def32 and
 * granularity fields are copied through unchanged.
 */
void
sdtossd(struct user_segment_descriptor *sd, struct soft_segment_descriptor *ssd)
{

	ssd->ssd_base  = (sd->sd_hibase << 24) | sd->sd_lobase;
	ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
	ssd->ssd_type  = sd->sd_type;
	ssd->ssd_dpl   = sd->sd_dpl;
	ssd->ssd_p     = sd->sd_p;
	ssd->ssd_long  = sd->sd_long;
	ssd->ssd_def32 = sd->sd_def32;
	ssd->ssd_gran  = sd->sd_gran;
}

void
ssdtosd(struct soft_segment_descriptor *ssd, struct user_segment_descriptor *sd) 605 { 606 607 sd->sd_lobase = (ssd->ssd_base) & 0xffffff; 608 sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff; 609 sd->sd_lolimit = (ssd->ssd_limit) & 0xffff; 610 sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf; 611 sd->sd_type = ssd->ssd_type; 612 sd->sd_dpl = ssd->ssd_dpl; 613 sd->sd_p = ssd->ssd_p; 614 sd->sd_long = ssd->ssd_long; 615 sd->sd_def32 = ssd->ssd_def32; 616 sd->sd_gran = ssd->ssd_gran; 617 } 618 619 void 620 ssdtosyssd(struct soft_segment_descriptor *ssd, struct system_segment_descriptor *sd) 621 { 622 623 sd->sd_lobase = (ssd->ssd_base) & 0xffffff; 624 sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful; 625 sd->sd_lolimit = (ssd->ssd_limit) & 0xffff; 626 sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf; 627 sd->sd_type = ssd->ssd_type; 628 sd->sd_dpl = ssd->ssd_dpl; 629 sd->sd_p = ssd->ssd_p; 630 sd->sd_gran = ssd->ssd_gran; 631 } 632 633 u_int basemem; 634 635 static int 636 add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap, 637 int *physmap_idxp) 638 { 639 int i, insert_idx, physmap_idx; 640 641 physmap_idx = *physmap_idxp; 642 643 if (length == 0) 644 return (1); 645 646 /* 647 * Find insertion point while checking for overlap. Start off by 648 * assuming the new entry will be added to the end. 649 * 650 * NB: physmap_idx points to the next free slot. 651 */ 652 insert_idx = physmap_idx; 653 for (i = 0; i < physmap_idx; i += 2) { 654 if (base < physmap[i + 1]) { 655 if (base + length <= physmap[i]) { 656 insert_idx = i; 657 break; 658 } 659 if (boothowto & RB_VERBOSE) 660 printf( 661 "Overlapping memory regions, ignoring second region\n"); 662 return (1); 663 } 664 } 665 666 /* See if we can prepend to the next entry. */ 667 if (insert_idx < physmap_idx && base + length == physmap[insert_idx]) { 668 physmap[insert_idx] = base; 669 return (1); 670 } 671 672 /* See if we can append to the previous entry. 
*/ 673 if (insert_idx > 0 && base == physmap[insert_idx - 1]) { 674 physmap[insert_idx - 1] += length; 675 return (1); 676 } 677 678 if (physmap_idx == PHYS_AVAIL_ENTRIES) { 679 printf( 680 "Too many segments in the physical address map, giving up\n"); 681 return (0); 682 } 683 684 /* 685 * Move the last 'N' entries down to make room for the new 686 * entry if needed. 687 */ 688 for (i = physmap_idx; i > insert_idx; i -= 2) { 689 physmap[i] = physmap[i - 2]; 690 physmap[i + 1] = physmap[i - 1]; 691 } 692 693 physmap_idx += 2; 694 *physmap_idxp = physmap_idx; 695 696 /* Insert the new entry. */ 697 physmap[insert_idx] = base; 698 physmap[insert_idx + 1] = base + length; 699 return (1); 700 } 701 702 void 703 bios_add_smap_entries(struct bios_smap *smapbase, u_int32_t smapsize, 704 vm_paddr_t *physmap, int *physmap_idx) 705 { 706 struct bios_smap *smap, *smapend; 707 708 smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize); 709 710 for (smap = smapbase; smap < smapend; smap++) { 711 if (boothowto & RB_VERBOSE) 712 printf("SMAP type=%02x base=%016lx len=%016lx\n", 713 smap->type, smap->base, smap->length); 714 715 if (smap->type != SMAP_TYPE_MEMORY) 716 continue; 717 718 if (!add_physmap_entry(smap->base, smap->length, physmap, 719 physmap_idx)) 720 break; 721 } 722 } 723 724 static void 725 add_efi_map_entries(struct efi_map_header *efihdr, vm_paddr_t *physmap, 726 int *physmap_idx) 727 { 728 struct efi_md *map, *p; 729 const char *type; 730 size_t efisz; 731 int ndesc, i; 732 733 static const char *types[] = { 734 "Reserved", 735 "LoaderCode", 736 "LoaderData", 737 "BootServicesCode", 738 "BootServicesData", 739 "RuntimeServicesCode", 740 "RuntimeServicesData", 741 "ConventionalMemory", 742 "UnusableMemory", 743 "ACPIReclaimMemory", 744 "ACPIMemoryNVS", 745 "MemoryMappedIO", 746 "MemoryMappedIOPortSpace", 747 "PalCode", 748 "PersistentMemory" 749 }; 750 751 /* 752 * Memory map data provided by UEFI via the GetMemoryMap 753 * Boot Services API. 
754 */ 755 efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf; 756 map = (struct efi_md *)((uint8_t *)efihdr + efisz); 757 758 if (efihdr->descriptor_size == 0) 759 return; 760 ndesc = efihdr->memory_size / efihdr->descriptor_size; 761 762 if (boothowto & RB_VERBOSE) 763 printf("%23s %12s %12s %8s %4s\n", 764 "Type", "Physical", "Virtual", "#Pages", "Attr"); 765 766 TUNABLE_INT_FETCH("machdep.efirt.regs", &efi_map_regs); 767 for (i = 0, p = map; i < ndesc; i++, 768 p = efi_next_descriptor(p, efihdr->descriptor_size)) { 769 if (boothowto & RB_VERBOSE) { 770 if (p->md_type < nitems(types)) 771 type = types[p->md_type]; 772 else 773 type = "<INVALID>"; 774 printf("%23s %012lx %012lx %08lx ", type, p->md_phys, 775 p->md_virt, p->md_pages); 776 if (p->md_attr & EFI_MD_ATTR_UC) 777 printf("UC "); 778 if (p->md_attr & EFI_MD_ATTR_WC) 779 printf("WC "); 780 if (p->md_attr & EFI_MD_ATTR_WT) 781 printf("WT "); 782 if (p->md_attr & EFI_MD_ATTR_WB) 783 printf("WB "); 784 if (p->md_attr & EFI_MD_ATTR_UCE) 785 printf("UCE "); 786 if (p->md_attr & EFI_MD_ATTR_WP) 787 printf("WP "); 788 if (p->md_attr & EFI_MD_ATTR_RP) 789 printf("RP "); 790 if (p->md_attr & EFI_MD_ATTR_XP) 791 printf("XP "); 792 if (p->md_attr & EFI_MD_ATTR_NV) 793 printf("NV "); 794 if (p->md_attr & EFI_MD_ATTR_MORE_RELIABLE) 795 printf("MORE_RELIABLE "); 796 if (p->md_attr & EFI_MD_ATTR_RO) 797 printf("RO "); 798 if (p->md_attr & EFI_MD_ATTR_RT) 799 printf("RUNTIME"); 800 printf("\n"); 801 } 802 803 switch (p->md_type) { 804 case EFI_MD_TYPE_BS_CODE: 805 case EFI_MD_TYPE_BS_DATA: 806 if (EFI_MAP_BOOTTYPE_ALLOWED(p->md_type)) 807 continue; 808 /* FALLTHROUGH */ 809 case EFI_MD_TYPE_CODE: 810 case EFI_MD_TYPE_DATA: 811 case EFI_MD_TYPE_FREE: 812 /* 813 * We're allowed to use any entry with these types. 
814 */ 815 break; 816 default: 817 continue; 818 } 819 820 if (!add_physmap_entry(p->md_phys, p->md_pages * EFI_PAGE_SIZE, 821 physmap, physmap_idx)) 822 break; 823 } 824 } 825 826 static void 827 native_parse_memmap(vm_paddr_t *physmap, int *physmap_idx) 828 { 829 struct bios_smap *smap; 830 struct efi_map_header *efihdr; 831 832 efihdr = (struct efi_map_header *)preload_search_info(preload_kmdp, 833 MODINFO_METADATA | MODINFOMD_EFI_MAP); 834 smap = (struct bios_smap *)preload_search_info(preload_kmdp, 835 MODINFO_METADATA | MODINFOMD_SMAP); 836 if (efihdr == NULL && smap == NULL) 837 panic("No BIOS smap or EFI map info from loader!"); 838 839 if (efihdr != NULL) { 840 add_efi_map_entries(efihdr, physmap, physmap_idx); 841 strlcpy(bootmethod, "UEFI", sizeof(bootmethod)); 842 } else { 843 /* 844 * Memory map from INT 15:E820. 845 * 846 * subr_module.c says: 847 * "Consumer may safely assume that size value precedes data." 848 * ie: an int32_t immediately precedes smap. 849 */ 850 u_int32_t size = *((u_int32_t *)smap - 1); 851 852 bios_add_smap_entries(smap, size, physmap, physmap_idx); 853 strlcpy(bootmethod, "BIOS", sizeof(bootmethod)); 854 } 855 } 856 857 #define PAGES_PER_GB (1024 * 1024 * 1024 / PAGE_SIZE) 858 859 /* 860 * Populate the (physmap) array with base/bound pairs describing the 861 * available physical memory in the system, then test this memory and 862 * build the phys_avail array describing the actually-available memory. 863 * 864 * Total memory size may be set by the kernel environment variable 865 * hw.physmem or the compile-time define MAXMEM. 866 * 867 * XXX first should be vm_paddr_t. 
868 */ 869 static void 870 getmemsize(u_int64_t first) 871 { 872 int i, physmap_idx, pa_indx, da_indx; 873 vm_paddr_t pa, physmap[PHYS_AVAIL_ENTRIES]; 874 u_long physmem_start, physmem_tunable, memtest; 875 pt_entry_t *pte; 876 quad_t dcons_addr, dcons_size; 877 int page_counter; 878 879 TSENTER(); 880 /* 881 * Tell the physical memory allocator about pages used to store 882 * the kernel and preloaded data. See kmem_bootstrap_free(). 883 */ 884 vm_phys_early_add_seg((vm_paddr_t)kernphys, trunc_page(first)); 885 886 bzero(physmap, sizeof(physmap)); 887 physmap_idx = 0; 888 889 init_ops.parse_memmap(physmap, &physmap_idx); 890 physmap_idx -= 2; 891 892 /* 893 * Find the 'base memory' segment for SMP 894 */ 895 basemem = 0; 896 for (i = 0; i <= physmap_idx; i += 2) { 897 if (physmap[i] <= 0xA0000) { 898 basemem = physmap[i + 1] / 1024; 899 break; 900 } 901 } 902 if (basemem == 0 || basemem > 640) { 903 if (bootverbose) 904 printf( 905 "Memory map doesn't contain a basemem segment, faking it"); 906 basemem = 640; 907 } 908 909 /* 910 * Maxmem isn't the "maximum memory", it's one larger than the 911 * highest page of the physical address space. It should be 912 * called something like "Maxphyspage". We may adjust this 913 * based on ``hw.physmem'' and the results of the memory test. 914 */ 915 Maxmem = atop(physmap[physmap_idx + 1]); 916 917 #ifdef MAXMEM 918 Maxmem = MAXMEM / 4; 919 #endif 920 921 if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable)) 922 Maxmem = atop(physmem_tunable); 923 924 /* 925 * The boot memory test is disabled by default, as it takes a 926 * significant amount of time on large-memory systems, and is 927 * unfriendly to virtual machines as it unnecessarily touches all 928 * pages. 929 * 930 * A general name is used as the code may be extended to support 931 * additional tests beyond the current "page present" test. 
932 */ 933 memtest = 0; 934 TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest); 935 936 /* 937 * Don't allow MAXMEM or hw.physmem to extend the amount of memory 938 * in the system. 939 */ 940 if (Maxmem > atop(physmap[physmap_idx + 1])) 941 Maxmem = atop(physmap[physmap_idx + 1]); 942 943 if (atop(physmap[physmap_idx + 1]) != Maxmem && 944 (boothowto & RB_VERBOSE)) 945 printf("Physical memory use set to %ldK\n", Maxmem * 4); 946 947 /* call pmap initialization to make new kernel address space */ 948 pmap_bootstrap(&first); 949 950 /* 951 * Size up each available chunk of physical memory. 952 * 953 * XXX Some BIOSes corrupt low 64KB between suspend and resume. 954 * By default, mask off the first 16 pages unless we appear to be 955 * running in a VM. 956 */ 957 physmem_start = (vm_guest > VM_GUEST_NO ? 1 : 16) << PAGE_SHIFT; 958 TUNABLE_ULONG_FETCH("hw.physmem.start", &physmem_start); 959 if (physmap[0] < physmem_start) { 960 if (physmem_start < PAGE_SIZE) 961 physmap[0] = PAGE_SIZE; 962 else if (physmem_start >= physmap[1]) 963 physmap[0] = round_page(physmap[1] - PAGE_SIZE); 964 else 965 physmap[0] = round_page(physmem_start); 966 } 967 pa_indx = 0; 968 da_indx = 1; 969 phys_avail[pa_indx++] = physmap[0]; 970 phys_avail[pa_indx] = physmap[0]; 971 dump_avail[da_indx] = physmap[0]; 972 pte = CMAP1; 973 974 /* 975 * Get dcons buffer address 976 */ 977 if (getenv_quad("dcons.addr", &dcons_addr) == 0 || 978 getenv_quad("dcons.size", &dcons_size) == 0) 979 dcons_addr = 0; 980 981 /* 982 * physmap is in bytes, so when converting to page boundaries, 983 * round up the start address and round down the end address. 
984 */ 985 page_counter = 0; 986 if (memtest != 0) 987 printf("Testing system memory"); 988 for (i = 0; i <= physmap_idx; i += 2) { 989 vm_paddr_t end; 990 991 end = ptoa((vm_paddr_t)Maxmem); 992 if (physmap[i + 1] < end) 993 end = trunc_page(physmap[i + 1]); 994 for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) { 995 int *ptr = (int *)CADDR1; 996 int tmp; 997 bool full, page_bad; 998 999 full = false; 1000 /* 1001 * block out kernel memory as not available. 1002 */ 1003 if (pa >= (vm_paddr_t)kernphys && pa < first) 1004 goto do_dump_avail; 1005 1006 /* 1007 * block out dcons buffer 1008 */ 1009 if (dcons_addr > 0 1010 && pa >= trunc_page(dcons_addr) 1011 && pa < dcons_addr + dcons_size) 1012 goto do_dump_avail; 1013 1014 page_bad = false; 1015 if (memtest == 0) 1016 goto skip_memtest; 1017 1018 /* 1019 * Print a "." every GB to show we're making 1020 * progress. 1021 */ 1022 page_counter++; 1023 if ((page_counter % PAGES_PER_GB) == 0) 1024 printf("."); 1025 1026 /* 1027 * map page into kernel: valid, read/write,non-cacheable 1028 */ 1029 *pte = pa | PG_V | PG_RW | PG_NC_PWT | PG_NC_PCD; 1030 invltlb(); 1031 1032 tmp = *(int *)ptr; 1033 /* 1034 * Test for alternating 1's and 0's 1035 */ 1036 *(volatile int *)ptr = 0xaaaaaaaa; 1037 if (*(volatile int *)ptr != 0xaaaaaaaa) 1038 page_bad = true; 1039 /* 1040 * Test for alternating 0's and 1's 1041 */ 1042 *(volatile int *)ptr = 0x55555555; 1043 if (*(volatile int *)ptr != 0x55555555) 1044 page_bad = true; 1045 /* 1046 * Test for all 1's 1047 */ 1048 *(volatile int *)ptr = 0xffffffff; 1049 if (*(volatile int *)ptr != 0xffffffff) 1050 page_bad = true; 1051 /* 1052 * Test for all 0's 1053 */ 1054 *(volatile int *)ptr = 0x0; 1055 if (*(volatile int *)ptr != 0x0) 1056 page_bad = true; 1057 /* 1058 * Restore original value. 1059 */ 1060 *(int *)ptr = tmp; 1061 1062 skip_memtest: 1063 /* 1064 * Adjust array of valid/good pages. 
1065 */ 1066 if (page_bad == true) 1067 continue; 1068 /* 1069 * If this good page is a continuation of the 1070 * previous set of good pages, then just increase 1071 * the end pointer. Otherwise start a new chunk. 1072 * Note that "end" points one higher than end, 1073 * making the range >= start and < end. 1074 * If we're also doing a speculative memory 1075 * test and we at or past the end, bump up Maxmem 1076 * so that we keep going. The first bad page 1077 * will terminate the loop. 1078 */ 1079 if (phys_avail[pa_indx] == pa) { 1080 phys_avail[pa_indx] += PAGE_SIZE; 1081 } else { 1082 pa_indx++; 1083 if (pa_indx == PHYS_AVAIL_ENTRIES) { 1084 printf( 1085 "Too many holes in the physical address space, giving up\n"); 1086 pa_indx--; 1087 full = true; 1088 goto do_dump_avail; 1089 } 1090 phys_avail[pa_indx++] = pa; /* start */ 1091 phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */ 1092 } 1093 physmem++; 1094 do_dump_avail: 1095 if (dump_avail[da_indx] == pa) { 1096 dump_avail[da_indx] += PAGE_SIZE; 1097 } else { 1098 da_indx++; 1099 if (da_indx == PHYS_AVAIL_ENTRIES) { 1100 da_indx--; 1101 goto do_next; 1102 } 1103 dump_avail[da_indx++] = pa; /* start */ 1104 dump_avail[da_indx] = pa + PAGE_SIZE; /* end */ 1105 } 1106 do_next: 1107 if (full) 1108 break; 1109 } 1110 } 1111 *pte = 0; 1112 invltlb(); 1113 if (memtest != 0) 1114 printf("\n"); 1115 1116 /* 1117 * XXX 1118 * The last chunk must contain at least one page plus the message 1119 * buffer to avoid complicating other code (message buffer address 1120 * calculation, etc.). 1121 */ 1122 while (phys_avail[pa_indx - 1] + PAGE_SIZE + 1123 round_page(msgbufsize) >= phys_avail[pa_indx]) { 1124 physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]); 1125 phys_avail[pa_indx--] = 0; 1126 phys_avail[pa_indx--] = 0; 1127 } 1128 1129 Maxmem = atop(phys_avail[pa_indx]); 1130 1131 /* Trim off space for the message buffer. */ 1132 phys_avail[pa_indx] -= round_page(msgbufsize); 1133 1134 /* Map the message buffer. 
*/
	msgbufp = PHYS_TO_DMAP(phys_avail[pa_indx]);
	TSEXIT();
}

/*
 * Locate and decode the preload metadata handed over by the loader.
 *
 * modulep is the loader-supplied metadata address without the KERNBASE
 * bias; the bias is added here before the pointer is used.  On return
 * preload_metadata, boothowto, the static kernel environment and
 * efi_systbl_phys are set.  When built with DDB, the kernel symbol table
 * bounds found in the metadata are handed to db_fetch_ksymtab() as well.
 * preload_kmdp is presumably set up by preload_initkmdp() -- confirm.
 */
static void
native_parse_preload_data(u_int64_t modulep)
{
	char *envp;
#ifdef DDB
	vm_offset_t ksym_start;
	vm_offset_t ksym_end;
#endif

	preload_metadata = (caddr_t)(uintptr_t)(modulep + KERNBASE);
	preload_bootstrap_relocate(KERNBASE);
	preload_initkmdp(true);
	boothowto = MD_FETCH(preload_kmdp, MODINFOMD_HOWTO, int);
	envp = MD_FETCH(preload_kmdp, MODINFOMD_ENVP, char *);
	/* A present environment pointer gets the same KERNBASE bias. */
	if (envp != NULL)
		envp += KERNBASE;
	/* Called even when envp is NULL. */
	init_static_kenv(envp, 0);
#ifdef DDB
	ksym_start = MD_FETCH(preload_kmdp, MODINFOMD_SSYM, uintptr_t);
	ksym_end = MD_FETCH(preload_kmdp, MODINFOMD_ESYM, uintptr_t);
	db_fetch_ksymtab(ksym_start, ksym_end, 0);
#endif
	efi_systbl_phys = MD_FETCH(preload_kmdp, MODINFOMD_FW_HANDLE,
	    vm_paddr_t);
}

/*
 * Native clock source setup: only calls i8254_init().
 */
static void
native_clock_source_init(void)
{
	i8254_init();
}

/*
 * Initialize KDB.  In kernels built with KDB, additionally enter the
 * debugger right away when RB_KDB is set in boothowto.
 */
static void
amd64_kdb_init(void)
{
	kdb_init();
#ifdef KDB
	if (boothowto & RB_KDB)
		kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger");
#endif
}

/*
 * Set up the fast syscall MSRs on the current CPU:
 * - EFER gets EFER_SCE or-ed in;
 * - LSTAR points at the PTI or non-PTI syscall entry, depending on pti;
 * - CSTAR points at the 32-bit syscall entry;
 * - STAR carries the kernel code selector shifted to bit 32 and the 32-bit
 *   user code selector shifted to bit 48;
 * - SF_MASK is loaded with PSL_NT, PSL_T, PSL_I, PSL_C, PSL_D and PSL_AC.
 */
void
amd64_conf_fast_syscall(void)
{
	uint64_t msr;

	msr = rdmsr(MSR_EFER) | EFER_SCE;
	wrmsr(MSR_EFER, msr);
	wrmsr(MSR_LSTAR, pti ? (u_int64_t)IDTVEC(fast_syscall_pti) :
	    (u_int64_t)IDTVEC(fast_syscall));
	wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
	msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
	    ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
	wrmsr(MSR_STAR, msr);
	wrmsr(MSR_SF_MASK, PSL_NT | PSL_T | PSL_I | PSL_C | PSL_D | PSL_AC);
}

/*
 * First stage of BSP per-CPU setup.
 *
 * Records pc as the per-CPU private space, then fills in curthread
 * (thread0), the TSS pointer, the TSS and LDT descriptor slots and the
 * 32-bit %fs/%gs descriptor slots inside this CPU's GDT, the user %cr3 load
 * mask and the initial smp_tlb_gen value.
 */
void
amd64_bsp_pcpu_init1(struct pcpu *pc)
{
	struct user_segment_descriptor *gdt;

	/*
	 * NOTE(review): prvspace is stored first; the PCPU_PTR() uses below
	 * presumably depend on it -- confirm before reordering.
	 */
	PCPU_SET(prvspace, pc);
	gdt = *PCPU_PTR(gdt);
	PCPU_SET(curthread, &thread0);
	PCPU_SET(tssp, PCPU_PTR(common_tss));
	PCPU_SET(tss, (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
	PCPU_SET(ldt, (struct system_segment_descriptor *)&gdt[GUSERLDT_SEL]);
	PCPU_SET(fs32p, &gdt[GUFS32_SEL]);
	PCPU_SET(gs32p, &gdt[GUGS32_SEL]);
	PCPU_SET(ucr3_load_mask, PMAP_UCR3_NOMASK);
	PCPU_SET(smp_tlb_gen, 1);
}

/*
 * Second stage of BSP per-CPU setup: record the kernel stack pointer rsp0,
 * the aligned top of the per-CPU PTI stack, and thread0's PCB as curpcb.
 */
void
amd64_bsp_pcpu_init2(uint64_t rsp0)
{

	PCPU_SET(rsp0, rsp0);
	/* pti_rsp0 is the aligned address just past the pti_stack array. */
	PCPU_SET(pti_rsp0, STACKALIGN((vm_offset_t)PCPU_PTR(pti_stack) +
	    PC_PTI_STACK_SZ * sizeof(uint64_t)));
	PCPU_SET(curpcb, thread0.td_pcb);
}

/*
 * Populate the IST slots of the BSP TSS.
 *
 * For each dedicated stack a struct nmi_pcpu is carved out of the very end
 * of the stack array, pc is stored in it, and the address of that structure
 * becomes the IST entry.
 */
void
amd64_bsp_ist_init(struct pcpu *pc)
{
	struct nmi_pcpu *np;
	struct amd64tss *tssp;

	tssp = &pc->pc_common_tss;

	/* doublefault stack space, runs on ist1 */
	np = ((struct nmi_pcpu *)&dblfault_stack[sizeof(dblfault_stack)]) - 1;
	np->np_pcpu = (register_t)pc;
	tssp->tss_ist1 = (long)np;

	/*
	 * NMI stack, runs on ist2.  The pcpu pointer is stored just
	 * above the start of the ist2 stack.
	 */
	np = ((struct nmi_pcpu *)&nmi0_stack[sizeof(nmi0_stack)]) - 1;
	np->np_pcpu = (register_t)pc;
	tssp->tss_ist2 = (long)np;

	/*
	 * MC# stack, runs on ist3.  The pcpu pointer is stored just
	 * above the start of the ist3 stack.
	 */
	np = ((struct nmi_pcpu *)&mce0_stack[sizeof(mce0_stack)]) - 1;
	np->np_pcpu = (register_t)pc;
	tssp->tss_ist3 = (long)np;

	/*
	 * DB# stack, runs on ist4.  The pcpu pointer is stored the same
	 * way as for the stacks above.
	 */
	np = ((struct nmi_pcpu *)&dbg0_stack[sizeof(dbg0_stack)]) - 1;
	np->np_pcpu = (register_t)pc;
	tssp->tss_ist4 = (long)np;
}

/*
 * Calculate the kernel load address by inspecting page table created by loader.
 * The assumptions:
 * - kernel is mapped at KERNBASE, backed by contiguous phys memory
 *   aligned at 2M, below 4G (the latter is important for AP startup)
 * - there is a 2M hole at KERNBASE (KERNSTART = KERNBASE + 2M)
 * - kernel is mapped with 2M superpages
 * - all participating memory, i.e. kernel, modules, metadata,
 *   page table is accessible by pre-created 1:1 mapping
 *   (right now loader creates 1:1 mapping for lower 4G, and all
 *   memory is from there)
 * - there is a usable memory block right after the end of the
 *   mapped kernel and all modules/metadata, pointed to by
 *   physfree, for early allocations
 *
 * The memory block after the end of the kernel is important, loader
 * must ensure that no critical data structures are put there.  Among
 * them is the trampoline page table, which must not be overwritten by
 * the allocations until pmap_bootstrap() switches %cr3 to the initial
 * version of the kernel page table.  Size of the block is controlled
 * by the 'staging_slop' command for loader.efi.
1283 */ 1284 vm_paddr_t __nosanitizeaddress __nosanitizememory 1285 amd64_loadaddr(void) 1286 { 1287 pml4_entry_t *pml4e; 1288 pdp_entry_t *pdpe; 1289 pd_entry_t *pde; 1290 uint64_t cr3; 1291 1292 cr3 = rcr3(); 1293 pml4e = (pml4_entry_t *)cr3 + pmap_pml4e_index(KERNSTART); 1294 pdpe = (pdp_entry_t *)(*pml4e & PG_FRAME) + pmap_pdpe_index(KERNSTART); 1295 pde = (pd_entry_t *)(*pdpe & PG_FRAME) + pmap_pde_index(KERNSTART); 1296 return (*pde & PG_FRAME); 1297 } 1298 1299 u_int64_t 1300 hammer_time(u_int64_t modulep, u_int64_t physfree) 1301 { 1302 int gsel_tss, x; 1303 struct pcpu *pc; 1304 uint64_t rsp0; 1305 char *env; 1306 struct user_segment_descriptor *gdt; 1307 struct region_descriptor r_gdt; 1308 size_t kstack0_sz; 1309 1310 TSRAW(&thread0, TS_ENTER, __func__, NULL); 1311 1312 kernphys = amd64_loadaddr(); 1313 1314 physfree += kernphys; 1315 1316 /* Initializes preload_kmdp */ 1317 init_ops.parse_preload_data(modulep); 1318 1319 efi_boot = preload_search_info(preload_kmdp, MODINFO_METADATA | 1320 MODINFOMD_EFI_MAP) != NULL; 1321 1322 if (!efi_boot) { 1323 /* Tell the bios to warmboot next time */ 1324 atomic_store_short((u_short *)0x472, 0x1234); 1325 } 1326 1327 physfree += ucode_load_bsp(physfree - kernphys + KERNSTART); 1328 physfree = roundup2(physfree, PAGE_SIZE); 1329 1330 identify_cpu1(); 1331 identify_hypervisor(); 1332 identify_hypervisor_smbios(); 1333 identify_cpu_fixup_bsp(); 1334 identify_cpu2(); 1335 initializecpucache(); 1336 1337 /* 1338 * Check for pti, pcid, and invpcid before ifuncs are 1339 * resolved, to correctly select the implementation for 1340 * pmap_activate_sw_mode(). 
1341 */ 1342 pti = pti_get_default(); 1343 TUNABLE_INT_FETCH("vm.pmap.pti", &pti); 1344 TUNABLE_INT_FETCH("vm.pmap.pcid_enabled", &pmap_pcid_enabled); 1345 if ((cpu_feature2 & CPUID2_PCID) == 0) 1346 pmap_pcid_enabled = 0; 1347 invpcid_works = (cpu_stdext_feature & CPUID_STDEXT_INVPCID) != 0; 1348 1349 /* 1350 * Now we can do small core initialization, after the PCID 1351 * CPU features and user knobs are evaluated. 1352 */ 1353 TUNABLE_INT_FETCH("vm.pmap.pcid_invlpg_workaround", 1354 &pmap_pcid_invlpg_workaround_uena); 1355 cpu_init_small_core(); 1356 1357 if ((cpu_feature2 & CPUID2_XSAVE) != 0) { 1358 use_xsave = 1; 1359 TUNABLE_INT_FETCH("hw.use_xsave", &use_xsave); 1360 } 1361 1362 sched_instance_select(); 1363 1364 link_elf_ireloc(); 1365 1366 /* 1367 * This may be done better later if it gets more high level 1368 * components in it. If so just link td->td_proc here. 1369 */ 1370 proc_linkup0(&proc0, &thread0); 1371 1372 /* Init basic tunables, hz etc */ 1373 init_param1(); 1374 1375 thread0.td_kstack = (char *)physfree - kernphys + KERNSTART; 1376 thread0.td_kstack_pages = kstack_pages; 1377 kstack0_sz = ptoa(kstack_pages); 1378 bzero(thread0.td_kstack, kstack0_sz); 1379 cpu_thread_new_kstack(&thread0); 1380 physfree += kstack0_sz; 1381 1382 /* 1383 * Initialize enough of thread0 for delayed invalidation to 1384 * work very early. Rely on thread0.td_base_pri 1385 * zero-initialization, it is reset to PVM at proc0_init(). 
1386 */ 1387 pmap_thread_init_invl_gen(&thread0); 1388 1389 pc = &temp_bsp_pcpu; 1390 pcpu_init(pc, 0, sizeof(struct pcpu)); 1391 gdt = &temp_bsp_pcpu.pc_gdt[0]; 1392 1393 /* 1394 * make gdt memory segments 1395 */ 1396 for (x = 0; x < NGDT; x++) { 1397 if (x != GPROC0_SEL && x != (GPROC0_SEL + 1) && 1398 x != GUSERLDT_SEL && x != (GUSERLDT_SEL + 1)) 1399 ssdtosd(&gdt_segs[x], &gdt[x]); 1400 } 1401 gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&pc->pc_common_tss; 1402 ssdtosyssd(&gdt_segs[GPROC0_SEL], 1403 (struct system_segment_descriptor *)&gdt[GPROC0_SEL]); 1404 1405 r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1; 1406 r_gdt.rd_base = (long)gdt; 1407 lgdt(&r_gdt); 1408 1409 wrmsr(MSR_FSBASE, 0); /* User value */ 1410 wrmsr(MSR_GSBASE, (u_int64_t)pc); 1411 wrmsr(MSR_KGSBASE, 0); /* User value while in the kernel */ 1412 1413 dpcpu_init((void *)(physfree - kernphys + KERNSTART), 0); 1414 physfree += DPCPU_SIZE; 1415 amd64_bsp_pcpu_init1(pc); 1416 /* Non-late cninit() and printf() can be moved up to here. */ 1417 1418 /* 1419 * Initialize mutexes. 1420 * 1421 * icu_lock: in order to allow an interrupt to occur in a critical 1422 * section, to set pcpu->ipending (etc...) properly, we 1423 * must be able to get the icu lock, so it can't be 1424 * under witness. 1425 */ 1426 mutex_init(); 1427 mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS); 1428 mtx_init(&dt_lock, "descriptor tables", NULL, MTX_DEF); 1429 1430 /* exceptions */ 1431 for (x = 0; x < NIDT; x++) 1432 setidt(x, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_SYSIGT, 1433 SEL_KPL, 0); 1434 setidt(IDT_DE, pti ? &IDTVEC(div_pti) : &IDTVEC(div), SDT_SYSIGT, 1435 SEL_KPL, 0); 1436 setidt(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 4); 1437 setidt(IDT_NMI, &IDTVEC(nmi), SDT_SYSIGT, SEL_KPL, 2); 1438 setidt(IDT_BP, pti ? &IDTVEC(bpt_pti) : &IDTVEC(bpt), SDT_SYSIGT, 1439 SEL_UPL, 0); 1440 setidt(IDT_OF, pti ? &IDTVEC(ofl_pti) : &IDTVEC(ofl), SDT_SYSIGT, 1441 SEL_UPL, 0); 1442 setidt(IDT_BR, pti ? 
&IDTVEC(bnd_pti) : &IDTVEC(bnd), SDT_SYSIGT, 1443 SEL_KPL, 0); 1444 setidt(IDT_UD, pti ? &IDTVEC(ill_pti) : &IDTVEC(ill), SDT_SYSIGT, 1445 SEL_KPL, 0); 1446 setidt(IDT_NM, pti ? &IDTVEC(dna_pti) : &IDTVEC(dna), SDT_SYSIGT, 1447 SEL_KPL, 0); 1448 setidt(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1); 1449 setidt(IDT_FPUGP, pti ? &IDTVEC(fpusegm_pti) : &IDTVEC(fpusegm), 1450 SDT_SYSIGT, SEL_KPL, 0); 1451 setidt(IDT_TS, pti ? &IDTVEC(tss_pti) : &IDTVEC(tss), SDT_SYSIGT, 1452 SEL_KPL, 0); 1453 setidt(IDT_NP, pti ? &IDTVEC(missing_pti) : &IDTVEC(missing), 1454 SDT_SYSIGT, SEL_KPL, 0); 1455 setidt(IDT_SS, pti ? &IDTVEC(stk_pti) : &IDTVEC(stk), SDT_SYSIGT, 1456 SEL_KPL, 0); 1457 setidt(IDT_GP, pti ? &IDTVEC(prot_pti) : &IDTVEC(prot), SDT_SYSIGT, 1458 SEL_KPL, 0); 1459 setidt(IDT_PF, pti ? &IDTVEC(page_pti) : &IDTVEC(page), SDT_SYSIGT, 1460 SEL_KPL, 0); 1461 setidt(IDT_MF, pti ? &IDTVEC(fpu_pti) : &IDTVEC(fpu), SDT_SYSIGT, 1462 SEL_KPL, 0); 1463 setidt(IDT_AC, pti ? &IDTVEC(align_pti) : &IDTVEC(align), SDT_SYSIGT, 1464 SEL_KPL, 0); 1465 setidt(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 3); 1466 setidt(IDT_XF, pti ? &IDTVEC(xmm_pti) : &IDTVEC(xmm), SDT_SYSIGT, 1467 SEL_KPL, 0); 1468 #ifdef KDTRACE_HOOKS 1469 setidt(IDT_DTRACE_RET, pti ? &IDTVEC(dtrace_ret_pti) : 1470 &IDTVEC(dtrace_ret), SDT_SYSIGT, SEL_UPL, 0); 1471 #endif 1472 #ifdef XENHVM 1473 setidt(IDT_EVTCHN, pti ? 
&IDTVEC(xen_intr_upcall_pti) : 1474 &IDTVEC(xen_intr_upcall), SDT_SYSIGT, SEL_KPL, 0); 1475 #endif 1476 r_idt.rd_limit = sizeof(idt0) - 1; 1477 r_idt.rd_base = (long) idt; 1478 lidt(&r_idt); 1479 1480 TUNABLE_INT_FETCH("hw.ibrs_disable", &hw_ibrs_disable); 1481 TUNABLE_INT_FETCH("machdep.mitigations.ibrs.disable", &hw_ibrs_disable); 1482 1483 TUNABLE_INT_FETCH("hw.spec_store_bypass_disable", &hw_ssb_disable); 1484 TUNABLE_INT_FETCH("machdep.mitigations.ssb.disable", &hw_ssb_disable); 1485 1486 TUNABLE_INT_FETCH("machdep.syscall_ret_flush_l1d", 1487 &syscall_ret_l1d_flush_mode); 1488 1489 TUNABLE_INT_FETCH("hw.mds_disable", &hw_mds_disable); 1490 TUNABLE_INT_FETCH("machdep.mitigations.mds.disable", &hw_mds_disable); 1491 1492 TUNABLE_INT_FETCH("machdep.mitigations.taa.enable", &x86_taa_enable); 1493 1494 TUNABLE_INT_FETCH("machdep.mitigations.rngds.enable", 1495 &x86_rngds_mitg_enable); 1496 1497 TUNABLE_INT_FETCH("machdep.mitigations.zenbleed.enable", 1498 &zenbleed_enable); 1499 zenbleed_sanitize_enable(); 1500 1501 finishidentcpu(); /* Final stage of CPU initialization */ 1502 1503 invlpgb_works = (amd_extended_feature_extensions & 1504 AMDFEID_INVLPGB) != 0; 1505 TUNABLE_INT_FETCH("vm.pmap.invlpgb_works", &invlpgb_works); 1506 if (invlpgb_works) 1507 invlpgb_maxcnt = cpu_procinfo3 & AMDID_INVLPGB_MAXCNT; 1508 1509 /* 1510 * Initialize the clock before the console so that console 1511 * initialization can use DELAY(). 1512 */ 1513 clock_init(); 1514 1515 initializecpu(); /* Initialize CPU registers */ 1516 1517 amd64_bsp_ist_init(pc); 1518 1519 /* Set the IO permission bitmap (empty due to tss seg limit) */ 1520 pc->pc_common_tss.tss_iobase = sizeof(struct amd64tss) + 1521 IOPERM_BITMAP_SIZE; 1522 1523 gsel_tss = GSEL(GPROC0_SEL, SEL_KPL); 1524 ltr(gsel_tss); 1525 1526 amd64_conf_fast_syscall(); 1527 1528 /* 1529 * We initialize the PCB pointer early so that exception 1530 * handlers will work. 
1531 */ 1532 thread0.td_pcb = get_pcb_td(&thread0); 1533 1534 /* 1535 * The console and kdb should be initialized even earlier than here, 1536 * but some console drivers don't work until after getmemsize(). 1537 * Default to late console initialization to support these drivers. 1538 * This loses mainly printf()s in getmemsize() and early debugging. 1539 */ 1540 TUNABLE_INT_FETCH("debug.late_console", &late_console); 1541 if (!late_console) { 1542 cninit(); 1543 amd64_kdb_init(); 1544 } 1545 1546 getmemsize(physfree); 1547 init_param2(physmem); 1548 1549 /* now running on new page tables, configured,and u/iom is accessible */ 1550 1551 #ifdef DEV_PCI 1552 /* This call might adjust phys_avail[]. */ 1553 pci_early_quirks(); 1554 #endif 1555 1556 if (late_console) 1557 cninit(); 1558 1559 /* 1560 * Dump the boot metadata. We have to wait for cninit() since console 1561 * output is required. If it's grossly incorrect the kernel will never 1562 * make it this far. 1563 */ 1564 if (getenv_is_true("debug.dump_modinfo_at_boot")) 1565 preload_dump(); 1566 1567 #ifdef DEV_ISA 1568 #ifdef DEV_ATPIC 1569 elcr_probe(); 1570 atpic_startup(); 1571 #else 1572 /* Reset and mask the atpics and leave them shut down. */ 1573 atpic_reset(); 1574 1575 /* 1576 * Point the ICU spurious interrupt vectors at the APIC spurious 1577 * interrupt handler. 1578 */ 1579 setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0); 1580 setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0); 1581 #endif 1582 #else 1583 #error "have you forgotten the isa device?" 1584 #endif 1585 1586 if (late_console) 1587 amd64_kdb_init(); 1588 1589 msgbufinit(msgbufp, msgbufsize); 1590 fpuinit(); 1591 1592 /* make an initial tss so cpu can get interrupt stack on syscall! 
*/ 1593 rsp0 = (uintptr_t)thread0.td_md.md_stack_base; 1594 /* Ensure the stack is aligned to 16 bytes */ 1595 rsp0 = STACKALIGN(rsp0); 1596 PCPU_PTR(common_tss)->tss_rsp0 = rsp0; 1597 amd64_bsp_pcpu_init2(rsp0); 1598 1599 /* transfer to user mode */ 1600 1601 _ucodesel = GSEL(GUCODE_SEL, SEL_UPL); 1602 _udatasel = GSEL(GUDATA_SEL, SEL_UPL); 1603 _ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL); 1604 _ufssel = GSEL(GUFS32_SEL, SEL_UPL); 1605 _ugssel = GSEL(GUGS32_SEL, SEL_UPL); 1606 1607 load_ds(_udatasel); 1608 load_es(_udatasel); 1609 load_fs(_ufssel); 1610 1611 /* setup proc 0's pcb */ 1612 thread0.td_pcb->pcb_flags = 0; 1613 1614 env = kern_getenv("kernelname"); 1615 if (env != NULL) 1616 strlcpy(kernelname, env, sizeof(kernelname)); 1617 1618 kcsan_cpu_init(0); 1619 1620 #ifdef FDT 1621 x86_init_fdt(); 1622 #endif 1623 1624 kasan_init(); 1625 kmsan_init(); 1626 1627 TSEXIT(); 1628 1629 /* Location of kernel stack for locore */ 1630 return ((uintptr_t)thread0.td_md.md_stack_base); 1631 } 1632 1633 void 1634 cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size) 1635 { 1636 1637 pcpu->pc_acpi_id = 0xffffffff; 1638 } 1639 1640 static int 1641 smap_sysctl_handler(SYSCTL_HANDLER_ARGS) 1642 { 1643 struct bios_smap *smapbase; 1644 struct bios_smap_xattr smap; 1645 uint32_t *smapattr; 1646 int count, error, i; 1647 1648 /* Retrieve the system memory map from the loader. 
*/ 1649 smapbase = (struct bios_smap *)preload_search_info(preload_kmdp, 1650 MODINFO_METADATA | MODINFOMD_SMAP); 1651 if (smapbase == NULL) 1652 return (0); 1653 smapattr = (uint32_t *)preload_search_info(preload_kmdp, 1654 MODINFO_METADATA | MODINFOMD_SMAP_XATTR); 1655 count = *((uint32_t *)smapbase - 1) / sizeof(*smapbase); 1656 error = 0; 1657 for (i = 0; i < count; i++) { 1658 smap.base = smapbase[i].base; 1659 smap.length = smapbase[i].length; 1660 smap.type = smapbase[i].type; 1661 if (smapattr != NULL) 1662 smap.xattr = smapattr[i]; 1663 else 1664 smap.xattr = 0; 1665 error = SYSCTL_OUT(req, &smap, sizeof(smap)); 1666 } 1667 return (error); 1668 } 1669 SYSCTL_PROC(_machdep, OID_AUTO, smap, 1670 CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, 1671 smap_sysctl_handler, "S,bios_smap_xattr", 1672 "Raw BIOS SMAP data"); 1673 1674 static int 1675 efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS) 1676 { 1677 struct efi_map_header *efihdr; 1678 uint32_t efisize; 1679 1680 efihdr = (struct efi_map_header *)preload_search_info(preload_kmdp, 1681 MODINFO_METADATA | MODINFOMD_EFI_MAP); 1682 if (efihdr == NULL) 1683 return (0); 1684 efisize = *((uint32_t *)efihdr - 1); 1685 return (SYSCTL_OUT(req, efihdr, efisize)); 1686 } 1687 SYSCTL_PROC(_machdep, OID_AUTO, efi_map, 1688 CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, 1689 efi_map_sysctl_handler, "S,efi_map_header", 1690 "Raw EFI Memory Map"); 1691 1692 static int 1693 efi_arch_sysctl_handler(SYSCTL_HANDLER_ARGS) 1694 { 1695 char *arch; 1696 1697 arch = (char *)preload_search_info(preload_kmdp, 1698 MODINFO_METADATA | MODINFOMD_EFI_ARCH); 1699 if (arch == NULL) 1700 return (0); 1701 1702 return (SYSCTL_OUT_STR(req, arch)); 1703 } 1704 SYSCTL_PROC(_machdep, OID_AUTO, efi_arch, 1705 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, 1706 efi_arch_sysctl_handler, "A", "EFI Firmware Architecture"); 1707 1708 void 1709 spinlock_enter(void) 1710 { 1711 struct thread *td; 1712 register_t flags; 1713 1714 td = 
curthread; 1715 if (td->td_md.md_spinlock_count == 0) { 1716 flags = intr_disable(); 1717 td->td_md.md_spinlock_count = 1; 1718 td->td_md.md_saved_flags = flags; 1719 critical_enter(); 1720 } else 1721 td->td_md.md_spinlock_count++; 1722 } 1723 1724 void 1725 spinlock_exit(void) 1726 { 1727 struct thread *td; 1728 register_t flags; 1729 1730 td = curthread; 1731 flags = td->td_md.md_saved_flags; 1732 td->td_md.md_spinlock_count--; 1733 if (td->td_md.md_spinlock_count == 0) { 1734 critical_exit(); 1735 intr_restore(flags); 1736 } 1737 } 1738 1739 /* 1740 * Construct a PCB from a trapframe. This is called from kdb_trap() where 1741 * we want to start a backtrace from the function that caused us to enter 1742 * the debugger. We have the context in the trapframe, but base the trace 1743 * on the PCB. The PCB doesn't have to be perfect, as long as it contains 1744 * enough for a backtrace. 1745 */ 1746 void 1747 makectx(struct trapframe *tf, struct pcb *pcb) 1748 { 1749 1750 pcb->pcb_r12 = tf->tf_r12; 1751 pcb->pcb_r13 = tf->tf_r13; 1752 pcb->pcb_r14 = tf->tf_r14; 1753 pcb->pcb_r15 = tf->tf_r15; 1754 pcb->pcb_rbp = tf->tf_rbp; 1755 pcb->pcb_rbx = tf->tf_rbx; 1756 pcb->pcb_rip = tf->tf_rip; 1757 pcb->pcb_rsp = tf->tf_rsp; 1758 } 1759 1760 /* 1761 * The pcb_flags is only modified by current thread, or by other threads 1762 * when current thread is stopped. However, current thread may change it 1763 * from the interrupt context in cpu_switch(), or in the trap handler. 1764 * When we read-modify-write pcb_flags from C sources, compiler may generate 1765 * code that is not atomic regarding the interrupt handler. If a trap or 1766 * interrupt happens and any flag is modified from the handler, it can be 1767 * clobbered with the cached value later. 
Therefore, we implement setting 1768 * and clearing flags with single-instruction functions, which do not race 1769 * with possible modification of the flags from the trap or interrupt context, 1770 * because traps and interrupts are executed only on instruction boundary. 1771 */ 1772 void 1773 set_pcb_flags_raw(struct pcb *pcb, const u_int flags) 1774 { 1775 1776 __asm __volatile("orl %1,%0" 1777 : "=m" (pcb->pcb_flags) : "ir" (flags), "m" (pcb->pcb_flags) 1778 : "cc", "memory"); 1779 1780 } 1781 1782 /* 1783 * The support for RDFSBASE, WRFSBASE and similar instructions for %gs 1784 * base requires that kernel saves MSR_FSBASE and MSR_{K,}GSBASE into 1785 * pcb if user space modified the bases. We must save on the context 1786 * switch or if the return to usermode happens through the doreti. 1787 * 1788 * Tracking of both events is performed by the pcb flag PCB_FULL_IRET, 1789 * which have a consequence that the base MSRs must be saved each time 1790 * the PCB_FULL_IRET flag is set. We disable interrupts to sync with 1791 * context switches. 1792 */ 1793 static void 1794 set_pcb_flags_fsgsbase(struct pcb *pcb, const u_int flags) 1795 { 1796 register_t r; 1797 1798 if (curpcb == pcb && 1799 (flags & PCB_FULL_IRET) != 0 && 1800 (pcb->pcb_flags & PCB_FULL_IRET) == 0) { 1801 r = intr_disable(); 1802 if ((pcb->pcb_flags & PCB_FULL_IRET) == 0) { 1803 pcb->pcb_fsbase = rdfsbase(); 1804 pcb->pcb_gsbase = rdmsr(MSR_KGSBASE); 1805 } 1806 set_pcb_flags_raw(pcb, flags); 1807 intr_restore(r); 1808 } else { 1809 set_pcb_flags_raw(pcb, flags); 1810 } 1811 } 1812 1813 DEFINE_IFUNC(, void, set_pcb_flags, (struct pcb *, const u_int)) 1814 { 1815 1816 return ((cpu_stdext_feature & CPUID_STDEXT_FSGSBASE) != 0 ? 
1817 set_pcb_flags_fsgsbase : set_pcb_flags_raw); 1818 } 1819 1820 void 1821 clear_pcb_flags(struct pcb *pcb, const u_int flags) 1822 { 1823 1824 __asm __volatile("andl %1,%0" 1825 : "=m" (pcb->pcb_flags) : "ir" (~flags), "m" (pcb->pcb_flags) 1826 : "cc", "memory"); 1827 } 1828 1829 extern const char wrmsr_early_safe_gp_handler[]; 1830 1831 void 1832 wrmsr_early_safe_start(void) 1833 { 1834 struct region_descriptor efi_idt; 1835 struct gate_descriptor *gpf_descr; 1836 int i; 1837 1838 efi_idt.rd_limit = 32 * sizeof(idt0[0]); 1839 efi_idt.rd_base = (uintptr_t)idt0; 1840 lidt(&efi_idt); 1841 1842 /* Setup handler for all possible exceptions. */ 1843 for (i = 0; i < 32; i++) { 1844 gpf_descr = &idt0[i]; 1845 gpf_descr->gd_looffset = 1846 (uintptr_t)wrmsr_early_safe_gp_handler; 1847 gpf_descr->gd_hioffset = 1848 (uintptr_t)wrmsr_early_safe_gp_handler >> 16; 1849 gpf_descr->gd_selector = rcs(); 1850 gpf_descr->gd_type = SDT_SYSTGT; 1851 gpf_descr->gd_p = 1; 1852 } 1853 } 1854 1855 void 1856 wrmsr_early_safe_end(void) 1857 { 1858 } 1859 1860 int 1861 safe_read(vm_offset_t addr, char *valp) 1862 { 1863 struct uio uio; 1864 struct iovec iov; 1865 1866 iov.iov_base = valp; 1867 iov.iov_len = 1; 1868 uio.uio_offset = addr; 1869 uio.uio_iov = &iov; 1870 uio.uio_iovcnt = 1; 1871 uio.uio_resid = 1; 1872 uio.uio_segflg = UIO_SYSSPACE; 1873 uio.uio_rw = UIO_READ; 1874 uio.uio_td = NULL; 1875 return (uiomove_mem(UIO_MEM_KMEM, &uio)); 1876 } 1877 1878 #ifdef KDB 1879 1880 /* 1881 * Provide inb() and outb() as functions. They are normally only available as 1882 * inline functions, thus cannot be called from the debugger. 
1883 */ 1884 1885 /* silence compiler warnings */ 1886 u_char inb_(u_short); 1887 void outb_(u_short, u_char); 1888 1889 u_char 1890 inb_(u_short port) 1891 { 1892 return inb(port); 1893 } 1894 1895 void 1896 outb_(u_short port, u_char data) 1897 { 1898 outb(port, data); 1899 } 1900 1901 #endif /* KDB */ 1902 1903 #undef memset 1904 #undef memmove 1905 #undef memcpy 1906 1907 void *memset_std(void *buf, int c, size_t len); 1908 void *memset_erms(void *buf, int c, size_t len); 1909 void *memmove_std(void * _Nonnull dst, const void * _Nonnull src, 1910 size_t len); 1911 void *memmove_erms(void * _Nonnull dst, const void * _Nonnull src, 1912 size_t len); 1913 void *memcpy_std(void * _Nonnull dst, const void * _Nonnull src, 1914 size_t len); 1915 void *memcpy_erms(void * _Nonnull dst, const void * _Nonnull src, 1916 size_t len); 1917 1918 #ifdef KCSAN 1919 /* 1920 * These fail to build as ifuncs when used with KCSAN. 1921 */ 1922 void * 1923 memset(void *buf, int c, size_t len) 1924 { 1925 1926 return (memset_std(buf, c, len)); 1927 } 1928 1929 void * 1930 memmove(void * _Nonnull dst, const void * _Nonnull src, size_t len) 1931 { 1932 1933 return (memmove_std(dst, src, len)); 1934 } 1935 1936 void * 1937 memcpy(void * _Nonnull dst, const void * _Nonnull src, size_t len) 1938 { 1939 1940 return (memcpy_std(dst, src, len)); 1941 } 1942 #else 1943 DEFINE_IFUNC(, void *, memset, (void *, int, size_t)) 1944 { 1945 1946 return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ? 1947 memset_erms : memset_std); 1948 } 1949 1950 DEFINE_IFUNC(, void *, memmove, (void * _Nonnull, const void * _Nonnull, 1951 size_t)) 1952 { 1953 1954 return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ? 1955 memmove_erms : memmove_std); 1956 } 1957 1958 DEFINE_IFUNC(, void *, memcpy, (void * _Nonnull, const void * _Nonnull,size_t)) 1959 { 1960 1961 return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ? 
1962 memcpy_erms : memcpy_std); 1963 } 1964 #endif 1965 1966 void pagezero_std(void *addr); 1967 void pagezero_erms(void *addr); 1968 DEFINE_IFUNC(, void , pagezero, (void *)) 1969 { 1970 1971 return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ? 1972 pagezero_erms : pagezero_std); 1973 } 1974