1 /*-
2 * SPDX-License-Identifier: BSD-4-Clause
3 *
4 * Copyright (c) 2003 Peter Wemm.
5 * Copyright (c) 1992 Terrence R. Lambert.
6 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
7 * All rights reserved.
8 *
9 * This code is derived from software contributed to Berkeley by
10 * William Jolitz.
11 *
12 * Redistribution and use in source and binary forms, with or without
13 * modification, are permitted provided that the following conditions
14 * are met:
15 * 1. Redistributions of source code must retain the above copyright
16 * notice, this list of conditions and the following disclaimer.
17 * 2. Redistributions in binary form must reproduce the above copyright
18 * notice, this list of conditions and the following disclaimer in the
19 * documentation and/or other materials provided with the distribution.
20 * 3. All advertising materials mentioning features or use of this software
21 * must display the following acknowledgement:
22 * This product includes software developed by the University of
23 * California, Berkeley and its contributors.
24 * 4. Neither the name of the University nor the names of its contributors
25 * may be used to endorse or promote products derived from this software
26 * without specific prior written permission.
27 *
28 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
29 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
32 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
33 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
34 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
35 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
37 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
38 * SUCH DAMAGE.
39 */
40
41 #include "opt_atpic.h"
42 #include "opt_cpu.h"
43 #include "opt_ddb.h"
44 #include "opt_inet.h"
45 #include "opt_isa.h"
46 #include "opt_kstack_pages.h"
47 #include "opt_maxmem.h"
48 #include "opt_pci.h"
49 #include "opt_platform.h"
50 #include "opt_sched.h"
51
52 #include <sys/param.h>
53 #include <sys/proc.h>
54 #include <sys/systm.h>
55 #include <sys/asan.h>
56 #include <sys/bio.h>
57 #include <sys/buf.h>
58 #include <sys/bus.h>
59 #include <sys/callout.h>
60 #include <sys/cons.h>
61 #include <sys/cpu.h>
62 #include <sys/csan.h>
63 #include <sys/efi.h>
64 #include <sys/eventhandler.h>
65 #include <sys/exec.h>
66 #include <sys/imgact.h>
67 #include <sys/kdb.h>
68 #include <sys/kernel.h>
69 #include <sys/ktr.h>
70 #include <sys/linker.h>
71 #include <sys/lock.h>
72 #include <sys/malloc.h>
73 #include <sys/memrange.h>
74 #include <sys/msan.h>
75 #include <sys/msgbuf.h>
76 #include <sys/mutex.h>
77 #include <sys/pcpu.h>
78 #include <sys/ptrace.h>
79 #include <sys/reboot.h>
80 #include <sys/reg.h>
81 #include <sys/rwlock.h>
82 #include <sys/sched.h>
83 #include <sys/signalvar.h>
84 #include <sys/smp.h>
85 #include <sys/syscallsubr.h>
86 #include <sys/sysctl.h>
87 #include <sys/sysent.h>
88 #include <sys/sysproto.h>
89 #include <sys/ucontext.h>
90 #include <sys/vmmeter.h>
91
92 #include <vm/vm.h>
93 #include <vm/vm_param.h>
94 #include <vm/vm_extern.h>
95 #include <vm/vm_kern.h>
96 #include <vm/vm_page.h>
97 #include <vm/vm_map.h>
98 #include <vm/vm_object.h>
99 #include <vm/vm_pager.h>
100 #include <vm/vm_phys.h>
101 #include <vm/vm_dumpset.h>
102
103 #ifdef DDB
104 #ifndef KDB
105 #error KDB must be enabled in order for DDB to work!
106 #endif
107 #include <ddb/ddb.h>
108 #include <ddb/db_sym.h>
109 #endif
110
111 #include <net/netisr.h>
112
113 #include <dev/smbios/smbios.h>
114
115 #include <machine/clock.h>
116 #include <machine/cpu.h>
117 #include <machine/cputypes.h>
118 #include <machine/frame.h>
119 #include <machine/intr_machdep.h>
120 #include <x86/mca.h>
121 #include <machine/md_var.h>
122 #include <machine/metadata.h>
123 #include <machine/pc/bios.h>
124 #include <machine/pcb.h>
125 #include <machine/proc.h>
126 #include <machine/sigframe.h>
127 #include <machine/specialreg.h>
128 #include <machine/trap.h>
129 #include <machine/tss.h>
130 #include <x86/ucode.h>
131 #include <x86/ifunc.h>
132 #include <machine/smp.h>
133 #ifdef FDT
134 #include <x86/fdt.h>
135 #endif
136
137 #ifdef DEV_ATPIC
138 #include <x86/isa/icu.h>
139 #else
140 #include <x86/apicvar.h>
141 #endif
142
143 #include <isa/isareg.h>
144 #include <isa/rtc.h>
145 #include <x86/init.h>
146
147 #ifndef SMP
148 #error amd64 requires options SMP
149 #endif
150
151 /* Sanity check for __curthread() */
152 CTASSERT(offsetof(struct pcpu, pc_curthread) == 0);
153
154 /*
155 * The PTI trampoline stack needs enough space for a hardware trapframe and a
156 * couple of scratch registers, as well as the trapframe left behind after an
157 * iret fault.
158 */
159 CTASSERT(PC_PTI_STACK_SZ * sizeof(register_t) >= 2 * sizeof(struct pti_frame) -
160 offsetof(struct pti_frame, pti_rip));
161
162 extern u_int64_t hammer_time(u_int64_t, u_int64_t);
163
164 static void cpu_startup(void *);
165 SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL);
166
167 /* Probe 8254 PIT and TSC. */
168 static void native_clock_source_init(void);
169
170 /* Preload data parse function */
171 static void native_parse_preload_data(u_int64_t);
172
173 /* Native function to fetch and parse the e820 map */
174 static void native_parse_memmap(vm_paddr_t *, int *);
175
176 /* Default init_ops implementation. */
177 struct init_ops init_ops = {
178 .parse_preload_data = native_parse_preload_data,
179 .early_clock_source_init = native_clock_source_init,
180 .early_delay = i8254_delay,
181 .parse_memmap = native_parse_memmap,
182 };
183
184 /*
185 * Physical address of the EFI System Table. Stashed from the metadata hints
186 * passed into the kernel and used by the EFI code to call runtime services.
187 */
188 vm_paddr_t efi_systbl_phys;
189
190 /*
191 * Bitmap of extra EFI memory region types that should be preserved and mapped
192 * during runtime services calls.
193 */
194 uint32_t efi_map_regs;
195
196 /* Intel ICH registers */
197 #define ICH_PMBASE 0x400
198 #define ICH_SMI_EN (ICH_PMBASE + 0x30)
199
200 int _udatasel, _ucodesel, _ucode32sel, _ufssel, _ugssel;
201
202 int cold = 1;
203
204 long Maxmem = 0;
205 long realmem = 0;
206 int late_console = 1;
207
208 struct kva_md_info kmi;
209
210 struct region_descriptor r_idt;
211
212 struct pcpu *__pcpu;
213 struct pcpu temp_bsp_pcpu;
214
215 struct mtx icu_lock;
216
217 struct mem_range_softc mem_range_softc;
218
219 struct mtx dt_lock; /* lock for GDT and LDT */
220
221 void (*vmm_suspend_p)(void);
222 void (*vmm_resume_p)(void);
223
224 bool efi_boot;
225
226 static void
227 cpu_startup(void *dummy)
228 {
229 uintmax_t memsize;
230 char *sysenv;
231
232 /*
233 * On MacBooks, we need to disallow the legacy USB circuit to
234 * generate an SMI# because this can cause several problems,
235 * namely: incorrect CPU frequency detection and failure to
236 * start the APs.
237 * We do this by disabling a bit in the SMI_EN (SMI Control and
238 * Enable register) of the Intel ICH LPC Interface Bridge.
239 */
240 sysenv = kern_getenv("smbios.system.product");
241 if (sysenv != NULL) {
242 if (strncmp(sysenv, "MacBook1,1", 10) == 0 ||
243 strncmp(sysenv, "MacBook3,1", 10) == 0 ||
244 strncmp(sysenv, "MacBook4,1", 10) == 0 ||
245 strncmp(sysenv, "MacBookPro1,1", 13) == 0 ||
246 strncmp(sysenv, "MacBookPro1,2", 13) == 0 ||
247 strncmp(sysenv, "MacBookPro3,1", 13) == 0 ||
248 strncmp(sysenv, "MacBookPro4,1", 13) == 0 ||
249 strncmp(sysenv, "Macmini1,1", 10) == 0) {
250 if (bootverbose)
251 printf("Disabling LEGACY_USB_EN bit on "
252 "Intel ICH.\n");
253 outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8);
254 }
255 freeenv(sysenv);
256 }
257
258 /*
259 * Good {morning,afternoon,evening,night}.
260 */
261 startrtclock();
262 printcpuinfo();
263
264 /*
265 * Display physical memory if SMBIOS reports reasonable amount.
266 */
267 memsize = 0;
268 sysenv = kern_getenv("smbios.memory.enabled");
269 if (sysenv != NULL) {
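/* The SMBIOS value is reported in kilobytes; shift left by 10 to get bytes. */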
270 memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10;
271 freeenv(sysenv);
272 }
273 if (memsize < ptoa((uintmax_t)vm_free_count()))
274 memsize = ptoa((uintmax_t)Maxmem);
275 printf("real memory = %ju (%ju MB)\n", memsize, memsize >> 20);
276 realmem = atop(memsize);
277
278 /*
279 * Display any holes after the first chunk of extended memory.
280 */
281 if (bootverbose) {
282 int indx;
283
284 printf("Physical memory chunk(s):\n");
285 for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
286 vm_paddr_t size;
287
288 size = phys_avail[indx + 1] - phys_avail[indx];
289 printf(
290 "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n",
291 (uintmax_t)phys_avail[indx],
292 (uintmax_t)phys_avail[indx + 1] - 1,
293 (uintmax_t)size, (uintmax_t)size / PAGE_SIZE);
294 }
295 }
296
297 vm_ksubmap_init(&kmi);
298
299 printf("avail memory = %ju (%ju MB)\n",
300 ptoa((uintmax_t)vm_free_count()),
301 ptoa((uintmax_t)vm_free_count()) / 1048576);
302 #ifdef DEV_PCI
303 if (bootverbose && intel_graphics_stolen_base != 0)
304 printf("intel stolen mem: base %#jx size %ju MB\n",
305 (uintmax_t)intel_graphics_stolen_base,
306 (uintmax_t)intel_graphics_stolen_size / 1024 / 1024);
307 #endif
308
309 /*
310 * Set up buffers, so they can be used to read disk labels.
311 */
312 bufinit();
313 vm_pager_bufferinit();
314
315 cpu_setregs();
316 }
317
318 static void
319 late_ifunc_resolve(void *dummy __unused)
320 {
321 link_elf_late_ireloc();
322 }
323 SYSINIT(late_ifunc_resolve, SI_SUB_CPU, SI_ORDER_ANY, late_ifunc_resolve, NULL);
324
325
326 void
327 cpu_setregs(void)
328 {
329 register_t cr0;
330
331 TSENTER();
332 cr0 = rcr0();
333 cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM;
334 TSENTER2("load_cr0");
335 load_cr0(cr0);
336 TSEXIT2("load_cr0");
337 TSEXIT();
338 }
339
340 /*
341 * Initialize amd64 and configure to run kernel
342 */
343
344 /*
345 * Initialize segments & interrupt table
346 */
347 static struct gate_descriptor idt0[NIDT];
348 struct gate_descriptor *idt = &idt0[0]; /* interrupt descriptor table */
349
350 static char dblfault_stack[DBLFAULT_STACK_SIZE] __aligned(16);
351 static char mce0_stack[MCE_STACK_SIZE] __aligned(16);
352 static char nmi0_stack[NMI_STACK_SIZE] __aligned(16);
353 static char dbg0_stack[DBG_STACK_SIZE] __aligned(16);
354 CTASSERT(sizeof(struct nmi_pcpu) == 16);
355
356 /*
357 * Software prototypes -- in more palatable form.
358 *
359 * Keep GUFS32, GUGS32, GUCODE32 and GUDATA at the same
360 * slots as corresponding segments for i386 kernel.
361 */
362 struct soft_segment_descriptor gdt_segs[] = {
363 [GNULL_SEL] = { /* 0 Null Descriptor */
364 .ssd_base = 0x0,
365 .ssd_limit = 0x0,
366 .ssd_type = 0,
367 .ssd_dpl = 0,
368 .ssd_p = 0,
369 .ssd_long = 0,
370 .ssd_def32 = 0,
371 .ssd_gran = 0 },
372 [GNULL2_SEL] = { /* 1 Null Descriptor */
373 .ssd_base = 0x0,
374 .ssd_limit = 0x0,
375 .ssd_type = 0,
376 .ssd_dpl = 0,
377 .ssd_p = 0,
378 .ssd_long = 0,
379 .ssd_def32 = 0,
380 .ssd_gran = 0 },
381 [GUFS32_SEL] = { /* 2 32 bit %gs Descriptor for user */
382 .ssd_base = 0x0,
383 .ssd_limit = 0xfffff,
384 .ssd_type = SDT_MEMRWA,
385 .ssd_dpl = SEL_UPL,
386 .ssd_p = 1,
387 .ssd_long = 0,
388 .ssd_def32 = 1,
389 .ssd_gran = 1 },
390 [GUGS32_SEL] = { /* 3 32 bit %fs Descriptor for user */
391 .ssd_base = 0x0,
392 .ssd_limit = 0xfffff,
393 .ssd_type = SDT_MEMRWA,
394 .ssd_dpl = SEL_UPL,
395 .ssd_p = 1,
396 .ssd_long = 0,
397 .ssd_def32 = 1,
398 .ssd_gran = 1 },
399 [GCODE_SEL] = { /* 4 Code Descriptor for kernel */
400 .ssd_base = 0x0,
401 .ssd_limit = 0xfffff,
402 .ssd_type = SDT_MEMERA,
403 .ssd_dpl = SEL_KPL,
404 .ssd_p = 1,
405 .ssd_long = 1,
406 .ssd_def32 = 0,
407 .ssd_gran = 1 },
408 [GDATA_SEL] = { /* 5 Data Descriptor for kernel */
409 .ssd_base = 0x0,
410 .ssd_limit = 0xfffff,
411 .ssd_type = SDT_MEMRWA,
412 .ssd_dpl = SEL_KPL,
413 .ssd_p = 1,
414 .ssd_long = 1,
415 .ssd_def32 = 0,
416 .ssd_gran = 1 },
417 [GUCODE32_SEL] = { /* 6 32 bit Code Descriptor for user */
418 .ssd_base = 0x0,
419 .ssd_limit = 0xfffff,
420 .ssd_type = SDT_MEMERA,
421 .ssd_dpl = SEL_UPL,
422 .ssd_p = 1,
423 .ssd_long = 0,
424 .ssd_def32 = 1,
425 .ssd_gran = 1 },
426 [GUDATA_SEL] = { /* 7 32/64 bit Data Descriptor for user */
427 .ssd_base = 0x0,
428 .ssd_limit = 0xfffff,
429 .ssd_type = SDT_MEMRWA,
430 .ssd_dpl = SEL_UPL,
431 .ssd_p = 1,
432 .ssd_long = 0,
433 .ssd_def32 = 1,
434 .ssd_gran = 1 },
435 [GUCODE_SEL] = { /* 8 64 bit Code Descriptor for user */
436 .ssd_base = 0x0,
437 .ssd_limit = 0xfffff,
438 .ssd_type = SDT_MEMERA,
439 .ssd_dpl = SEL_UPL,
440 .ssd_p = 1,
441 .ssd_long = 1,
442 .ssd_def32 = 0,
443 .ssd_gran = 1 },
444 [GPROC0_SEL] = { /* 9 Proc 0 TSS Descriptor */
445 .ssd_base = 0x0,
446 .ssd_limit = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE - 1,
447 .ssd_type = SDT_SYSTSS,
448 .ssd_dpl = SEL_KPL,
449 .ssd_p = 1,
450 .ssd_long = 0,
451 .ssd_def32 = 0,
452 .ssd_gran = 0 },
453 [GPROC0_SEL + 1] = { /* 10 Proc 0 TSS descriptor, double size */
454 .ssd_base = 0x0,
455 .ssd_limit = 0x0,
456 .ssd_type = 0,
457 .ssd_dpl = 0,
458 .ssd_p = 0,
459 .ssd_long = 0,
460 .ssd_def32 = 0,
461 .ssd_gran = 0 },
462 [GUSERLDT_SEL] = { /* 11 LDT Descriptor */
463 .ssd_base = 0x0,
464 .ssd_limit = 0x0,
465 .ssd_type = 0,
466 .ssd_dpl = 0,
467 .ssd_p = 0,
468 .ssd_long = 0,
469 .ssd_def32 = 0,
470 .ssd_gran = 0 },
471 [GUSERLDT_SEL + 1] = { /* 12 LDT Descriptor, double size */
472 .ssd_base = 0x0,
473 .ssd_limit = 0x0,
474 .ssd_type = 0,
475 .ssd_dpl = 0,
476 .ssd_p = 0,
477 .ssd_long = 0,
478 .ssd_def32 = 0,
479 .ssd_gran = 0 },
480 };
481 _Static_assert(nitems(gdt_segs) == NGDT, "Stale NGDT");
482
483 void
484 setidt(int idx, inthand_t *func, int typ, int dpl, int ist)
485 {
486 struct gate_descriptor *ip;
487
488 ip = idt + idx;
489 ip->gd_looffset = (uintptr_t)func;
490 ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
491 ip->gd_ist = ist;
492 ip->gd_xx = 0;
493 ip->gd_type = typ;
494 ip->gd_dpl = dpl;
495 ip->gd_p = 1;
496 ip->gd_hioffset = ((uintptr_t)func) >> 16;
497 }
498
499 extern inthand_t
500 IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
501 IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
502 IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
503 IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
504 IDTVEC(xmm), IDTVEC(dblfault),
505 IDTVEC(div_pti), IDTVEC(bpt_pti),
506 IDTVEC(ofl_pti), IDTVEC(bnd_pti), IDTVEC(ill_pti), IDTVEC(dna_pti),
507 IDTVEC(fpusegm_pti), IDTVEC(tss_pti), IDTVEC(missing_pti),
508 IDTVEC(stk_pti), IDTVEC(prot_pti), IDTVEC(page_pti),
509 IDTVEC(rsvd_pti), IDTVEC(fpu_pti), IDTVEC(align_pti),
510 IDTVEC(xmm_pti),
511 #ifdef KDTRACE_HOOKS
512 IDTVEC(dtrace_ret), IDTVEC(dtrace_ret_pti),
513 #endif
514 #ifdef XENHVM
515 IDTVEC(xen_intr_upcall), IDTVEC(xen_intr_upcall_pti),
516 #endif
517 IDTVEC(fast_syscall), IDTVEC(fast_syscall32),
518 IDTVEC(fast_syscall_pti);
519
520 #ifdef DDB
521 /*
522 * Display the index and function name of any IDT entries that don't use
523 * the default 'rsvd' entry point.
524 */
525 DB_SHOW_COMMAND_FLAGS(idt, db_show_idt, DB_CMD_MEMSAFE)
526 {
527 struct gate_descriptor *ip;
528 int idx;
529 uintptr_t func;
530
531 ip = idt;
532 for (idx = 0; idx < NIDT && !db_pager_quit; idx++) {
533 func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset);
534 if (func != (uintptr_t)&IDTVEC(rsvd)) {
535 db_printf("%3d\t", idx);
536 db_printsym(func, DB_STGY_PROC);
537 db_printf("\n");
538 }
539 ip++;
540 }
541 }
542
543 /* Show privileged registers. */
544 DB_SHOW_COMMAND_FLAGS(sysregs, db_show_sysregs, DB_CMD_MEMSAFE)
545 {
546 struct {
547 uint16_t limit;
548 uint64_t base;
549 } __packed idtr, gdtr;
550 uint16_t ldt, tr;
551
552 __asm __volatile("sidt %0" : "=m" (idtr));
553 db_printf("idtr\t0x%016lx/%04x\n",
554 (u_long)idtr.base, (u_int)idtr.limit);
555 __asm __volatile("sgdt %0" : "=m" (gdtr));
556 db_printf("gdtr\t0x%016lx/%04x\n",
557 (u_long)gdtr.base, (u_int)gdtr.limit);
558 __asm __volatile("sldt %0" : "=r" (ldt));
559 db_printf("ldtr\t0x%04x\n", ldt);
560 __asm __volatile("str %0" : "=r" (tr));
561 db_printf("tr\t0x%04x\n", tr);
562 db_printf("cr0\t0x%016lx\n", rcr0());
563 db_printf("cr2\t0x%016lx\n", rcr2());
564 db_printf("cr3\t0x%016lx\n", rcr3());
565 db_printf("cr4\t0x%016lx\n", rcr4());
566 if (rcr4() & CR4_XSAVE)
567 db_printf("xcr0\t0x%016lx\n", rxcr(0));
568 db_printf("EFER\t0x%016lx\n", rdmsr(MSR_EFER));
569 if (cpu_feature2 & (CPUID2_VMX | CPUID2_SMX))
570 db_printf("FEATURES_CTL\t%016lx\n",
571 rdmsr(MSR_IA32_FEATURE_CONTROL));
572 db_printf("DEBUG_CTL\t0x%016lx\n", rdmsr(MSR_DEBUGCTLMSR));
573 db_printf("PAT\t0x%016lx\n", rdmsr(MSR_PAT));
574 db_printf("GSBASE\t0x%016lx\n", rdmsr(MSR_GSBASE));
575 }
576
577 DB_SHOW_COMMAND_FLAGS(dbregs, db_show_dbregs, DB_CMD_MEMSAFE)
578 {
579
580 db_printf("dr0\t0x%016lx\n", rdr0());
581 db_printf("dr1\t0x%016lx\n", rdr1());
582 db_printf("dr2\t0x%016lx\n", rdr2());
583 db_printf("dr3\t0x%016lx\n", rdr3());
584 db_printf("dr6\t0x%016lx\n", rdr6());
585 db_printf("dr7\t0x%016lx\n", rdr7());
586 }
587 #endif
588
589 void
590 sdtossd(struct user_segment_descriptor *sd, struct soft_segment_descriptor *ssd)
591 {
592
593 ssd->ssd_base = (sd->sd_hibase << 24) | sd->sd_lobase;
594 ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
595 ssd->ssd_type = sd->sd_type;
596 ssd->ssd_dpl = sd->sd_dpl;
597 ssd->ssd_p = sd->sd_p;
598 ssd->ssd_long = sd->sd_long;
599 ssd->ssd_def32 = sd->sd_def32;
600 ssd->ssd_gran = sd->sd_gran;
601 }
602
603 void
604 ssdtosd(struct soft_segment_descriptor *ssd, struct user_segment_descriptor *sd)
605 {
606
607 sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
608 sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff;
609 sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
610 sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
611 sd->sd_type = ssd->ssd_type;
612 sd->sd_dpl = ssd->ssd_dpl;
613 sd->sd_p = ssd->ssd_p;
614 sd->sd_long = ssd->ssd_long;
615 sd->sd_def32 = ssd->ssd_def32;
616 sd->sd_gran = ssd->ssd_gran;
617 }
618
619 void
620 ssdtosyssd(struct soft_segment_descriptor *ssd, struct system_segment_descriptor *sd)
621 {
622
623 sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
624 sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful;
625 sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
626 sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
627 sd->sd_type = ssd->ssd_type;
628 sd->sd_dpl = ssd->ssd_dpl;
629 sd->sd_p = ssd->ssd_p;
630 sd->sd_gran = ssd->ssd_gran;
631 }
632
633 u_int basemem;
634
635 static int
636 add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap,
637 int *physmap_idxp)
638 {
639 int i, insert_idx, physmap_idx;
640
641 physmap_idx = *physmap_idxp;
642
643 if (length == 0)
644 return (1);
645
646 /*
647 * Find insertion point while checking for overlap. Start off by
648 * assuming the new entry will be added to the end.
649 *
650 * NB: physmap_idx points to the next free slot.
651 */
652 insert_idx = physmap_idx;
653 for (i = 0; i < physmap_idx; i += 2) {
654 if (base < physmap[i + 1]) {
655 if (base + length <= physmap[i]) {
656 insert_idx = i;
657 break;
658 }
659 if (boothowto & RB_VERBOSE)
660 printf(
661 "Overlapping memory regions, ignoring second region\n");
662 return (1);
663 }
664 }
665
666 /* See if we can prepend to the next entry. */
667 if (insert_idx < physmap_idx && base + length == physmap[insert_idx]) {
668 physmap[insert_idx] = base;
669 return (1);
670 }
671
672 /* See if we can append to the previous entry. */
673 if (insert_idx > 0 && base == physmap[insert_idx - 1]) {
674 physmap[insert_idx - 1] += length;
675 return (1);
676 }
677
678 if (physmap_idx == PHYS_AVAIL_ENTRIES) {
679 printf(
680 "Too many segments in the physical address map, giving up\n");
681 return (0);
682 }
683
684 /*
685 * Move the last 'N' entries down to make room for the new
686 * entry if needed.
687 */
688 for (i = physmap_idx; i > insert_idx; i -= 2) {
689 physmap[i] = physmap[i - 2];
690 physmap[i + 1] = physmap[i - 1];
691 }
692
693 physmap_idx += 2;
694 *physmap_idxp = physmap_idx;
695
696 /* Insert the new entry. */
697 physmap[insert_idx] = base;
698 physmap[insert_idx + 1] = base + length;
699 return (1);
700 }
701
702 void
703 bios_add_smap_entries(struct bios_smap *smapbase, u_int32_t smapsize,
704 vm_paddr_t *physmap, int *physmap_idx)
705 {
706 struct bios_smap *smap, *smapend;
707
708 smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);
709
710 for (smap = smapbase; smap < smapend; smap++) {
711 if (boothowto & RB_VERBOSE)
712 printf("SMAP type=%02x base=%016lx len=%016lx\n",
713 smap->type, smap->base, smap->length);
714
715 if (smap->type != SMAP_TYPE_MEMORY)
716 continue;
717
718 if (!add_physmap_entry(smap->base, smap->length, physmap,
719 physmap_idx))
720 break;
721 }
722 }
723
724 static void
725 add_efi_map_entries(struct efi_map_header *efihdr, vm_paddr_t *physmap,
726 int *physmap_idx)
727 {
728 struct efi_md *map, *p;
729 const char *type;
730 size_t efisz;
731 int ndesc, i;
732
733 static const char *types[] = {
734 "Reserved",
735 "LoaderCode",
736 "LoaderData",
737 "BootServicesCode",
738 "BootServicesData",
739 "RuntimeServicesCode",
740 "RuntimeServicesData",
741 "ConventionalMemory",
742 "UnusableMemory",
743 "ACPIReclaimMemory",
744 "ACPIMemoryNVS",
745 "MemoryMappedIO",
746 "MemoryMappedIOPortSpace",
747 "PalCode",
748 "PersistentMemory"
749 };
750
751 /*
752 * Memory map data provided by UEFI via the GetMemoryMap
753 * Boot Services API.
754 */
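/* Round the header up to a 16-byte boundary; the descriptor array follows it. */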
755 efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf;
756 map = (struct efi_md *)((uint8_t *)efihdr + efisz);
757
758 if (efihdr->descriptor_size == 0)
759 return;
760 ndesc = efihdr->memory_size / efihdr->descriptor_size;
761
762 if (boothowto & RB_VERBOSE)
763 printf("%23s %12s %12s %8s %4s\n",
764 "Type", "Physical", "Virtual", "#Pages", "Attr");
765
766 TUNABLE_INT_FETCH("machdep.efirt.regs", &efi_map_regs);
767 for (i = 0, p = map; i < ndesc; i++,
768 p = efi_next_descriptor(p, efihdr->descriptor_size)) {
769 if (boothowto & RB_VERBOSE) {
770 if (p->md_type < nitems(types))
771 type = types[p->md_type];
772 else
773 type = "<INVALID>";
774 printf("%23s %012lx %012lx %08lx ", type, p->md_phys,
775 p->md_virt, p->md_pages);
776 if (p->md_attr & EFI_MD_ATTR_UC)
777 printf("UC ");
778 if (p->md_attr & EFI_MD_ATTR_WC)
779 printf("WC ");
780 if (p->md_attr & EFI_MD_ATTR_WT)
781 printf("WT ");
782 if (p->md_attr & EFI_MD_ATTR_WB)
783 printf("WB ");
784 if (p->md_attr & EFI_MD_ATTR_UCE)
785 printf("UCE ");
786 if (p->md_attr & EFI_MD_ATTR_WP)
787 printf("WP ");
788 if (p->md_attr & EFI_MD_ATTR_RP)
789 printf("RP ");
790 if (p->md_attr & EFI_MD_ATTR_XP)
791 printf("XP ");
792 if (p->md_attr & EFI_MD_ATTR_NV)
793 printf("NV ");
794 if (p->md_attr & EFI_MD_ATTR_MORE_RELIABLE)
795 printf("MORE_RELIABLE ");
796 if (p->md_attr & EFI_MD_ATTR_RO)
797 printf("RO ");
798 if (p->md_attr & EFI_MD_ATTR_RT)
799 printf("RUNTIME");
800 printf("\n");
801 }
802
803 switch (p->md_type) {
804 case EFI_MD_TYPE_BS_CODE:
805 case EFI_MD_TYPE_BS_DATA:
806 if (EFI_MAP_BOOTTYPE_ALLOWED(p->md_type))
807 continue;
808 /* FALLTHROUGH */
809 case EFI_MD_TYPE_CODE:
810 case EFI_MD_TYPE_DATA:
811 case EFI_MD_TYPE_FREE:
812 /*
813 * We're allowed to use any entry with these types.
814 */
815 break;
816 default:
817 continue;
818 }
819
820 if (!add_physmap_entry(p->md_phys, p->md_pages * EFI_PAGE_SIZE,
821 physmap, physmap_idx))
822 break;
823 }
824 }
825
826 static void
827 native_parse_memmap(vm_paddr_t *physmap, int *physmap_idx)
828 {
829 struct bios_smap *smap;
830 struct efi_map_header *efihdr;
831 u_int32_t size;
832
833 /*
834 * Memory map from INT 15:E820.
835 *
836 * subr_module.c says:
837 * "Consumer may safely assume that size value precedes data."
838 * ie: an int32_t immediately precedes smap.
839 */
840
841 efihdr = (struct efi_map_header *)preload_search_info(preload_kmdp,
842 MODINFO_METADATA | MODINFOMD_EFI_MAP);
843 smap = (struct bios_smap *)preload_search_info(preload_kmdp,
844 MODINFO_METADATA | MODINFOMD_SMAP);
845 if (efihdr == NULL && smap == NULL)
846 panic("No BIOS smap or EFI map info from loader!");
847
848 if (efihdr != NULL) {
849 add_efi_map_entries(efihdr, physmap, physmap_idx);
850 strlcpy(bootmethod, "UEFI", sizeof(bootmethod));
851 } else {
852 size = *((u_int32_t *)smap - 1);
853 bios_add_smap_entries(smap, size, physmap, physmap_idx);
854 strlcpy(bootmethod, "BIOS", sizeof(bootmethod));
855 }
856 }
857
858 #define PAGES_PER_GB (1024 * 1024 * 1024 / PAGE_SIZE)
859
860 /*
861 * Populate the (physmap) array with base/bound pairs describing the
862 * available physical memory in the system, then test this memory and
863 * build the phys_avail array describing the actually-available memory.
864 *
865 * Total memory size may be set by the kernel environment variable
866 * hw.physmem or the compile-time define MAXMEM.
867 *
868 * XXX first should be vm_paddr_t.
869 */
870 static void
871 getmemsize(u_int64_t first)
872 {
873 int i, physmap_idx, pa_indx, da_indx;
874 vm_paddr_t pa, physmap[PHYS_AVAIL_ENTRIES];
875 u_long physmem_start, physmem_tunable, memtest;
876 pt_entry_t *pte;
877 quad_t dcons_addr, dcons_size;
878 int page_counter;
879
880 TSENTER();
881 /*
882 * Tell the physical memory allocator about pages used to store
883 * the kernel and preloaded data. See kmem_bootstrap_free().
884 */
885 vm_phys_early_add_seg((vm_paddr_t)kernphys, trunc_page(first));
886
887 bzero(physmap, sizeof(physmap));
888 physmap_idx = 0;
889
890 init_ops.parse_memmap(physmap, &physmap_idx);
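/* parse_memmap leaves physmap_idx at the next free slot; step back to the last base/bound pair. */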
891 physmap_idx -= 2;
892
893 /*
894 * Find the 'base memory' segment for SMP
895 */
896 basemem = 0;
897 for (i = 0; i <= physmap_idx; i += 2) {
898 if (physmap[i] <= 0xA0000) {
899 basemem = physmap[i + 1] / 1024;
900 break;
901 }
902 }
903 if (basemem == 0 || basemem > 640) {
904 if (bootverbose)
905 printf(
906 "Memory map doesn't contain a basemem segment, faking it");
907 basemem = 640;
908 }
909
910 /*
911 * Maxmem isn't the "maximum memory", it's one larger than the
912 * highest page of the physical address space. It should be
913 * called something like "Maxphyspage". We may adjust this
914 * based on ``hw.physmem'' and the results of the memory test.
915 */
916 Maxmem = atop(physmap[physmap_idx + 1]);
917
918 #ifdef MAXMEM
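/* MAXMEM is given in kilobytes; Maxmem counts 4 KB pages, hence the division by 4. */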
919 Maxmem = MAXMEM / 4;
920 #endif
921
922 if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
923 Maxmem = atop(physmem_tunable);
924
925 /*
926 * The boot memory test is disabled by default, as it takes a
927 * significant amount of time on large-memory systems, and is
928 * unfriendly to virtual machines as it unnecessarily touches all
929 * pages.
930 *
931 * A general name is used as the code may be extended to support
932 * additional tests beyond the current "page present" test.
933 */
934 memtest = 0;
935 TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest);
936
937 /*
938 * Don't allow MAXMEM or hw.physmem to extend the amount of memory
939 * in the system.
940 */
941 if (Maxmem > atop(physmap[physmap_idx + 1]))
942 Maxmem = atop(physmap[physmap_idx + 1]);
943
944 if (atop(physmap[physmap_idx + 1]) != Maxmem &&
945 (boothowto & RB_VERBOSE))
946 printf("Physical memory use set to %ldK\n", Maxmem * 4);
947
948 /* call pmap initialization to make new kernel address space */
949 pmap_bootstrap(&first);
950
951 /*
952 * Size up each available chunk of physical memory.
953 *
954 * XXX Some BIOSes corrupt low 64KB between suspend and resume.
955 * By default, mask off the first 16 pages unless we appear to be
956 * running in a VM.
957 */
958 physmem_start = (vm_guest > VM_GUEST_NO ? 1 : 16) << PAGE_SHIFT;
959 TUNABLE_ULONG_FETCH("hw.physmem.start", &physmem_start);
960 if (physmap[0] < physmem_start) {
961 if (physmem_start < PAGE_SIZE)
962 physmap[0] = PAGE_SIZE;
963 else if (physmem_start >= physmap[1])
964 physmap[0] = round_page(physmap[1] - PAGE_SIZE);
965 else
966 physmap[0] = round_page(physmem_start);
967 }
968 pa_indx = 0;
969 da_indx = 1;
970 phys_avail[pa_indx++] = physmap[0];
971 phys_avail[pa_indx] = physmap[0];
972 dump_avail[da_indx] = physmap[0];
973 pte = CMAP1;
974
975 /*
976 * Get dcons buffer address
977 */
978 if (getenv_quad("dcons.addr", &dcons_addr) == 0 ||
979 getenv_quad("dcons.size", &dcons_size) == 0)
980 dcons_addr = 0;
981
982 /*
983 * physmap is in bytes, so when converting to page boundaries,
984 * round up the start address and round down the end address.
985 */
986 page_counter = 0;
987 if (memtest != 0)
988 printf("Testing system memory");
989 for (i = 0; i <= physmap_idx; i += 2) {
990 vm_paddr_t end;
991
992 end = ptoa((vm_paddr_t)Maxmem);
993 if (physmap[i + 1] < end)
994 end = trunc_page(physmap[i + 1]);
995 for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
996 int *ptr = (int *)CADDR1;
997 int tmp;
998 bool full, page_bad;
999
1000 full = false;
1001 /*
1002 * block out kernel memory as not available.
1003 */
1004 if (pa >= (vm_paddr_t)kernphys && pa < first)
1005 goto do_dump_avail;
1006
1007 /*
1008 * block out dcons buffer
1009 */
1010 if (dcons_addr > 0
1011 && pa >= trunc_page(dcons_addr)
1012 && pa < dcons_addr + dcons_size)
1013 goto do_dump_avail;
1014
1015 page_bad = false;
1016 if (memtest == 0)
1017 goto skip_memtest;
1018
1019 /*
1020 * Print a "." every GB to show we're making
1021 * progress.
1022 */
1023 page_counter++;
1024 if ((page_counter % PAGES_PER_GB) == 0)
1025 printf(".");
1026
1027 /*
1028 * map page into kernel: valid, read/write, non-cacheable
1029 */
1030 *pte = pa | PG_V | PG_RW | PG_NC_PWT | PG_NC_PCD;
1031 invltlb();
1032
1033 tmp = *(int *)ptr;
1034 /*
1035 * Test for alternating 1's and 0's
1036 */
1037 *(volatile int *)ptr = 0xaaaaaaaa;
1038 if (*(volatile int *)ptr != 0xaaaaaaaa)
1039 page_bad = true;
1040 /*
1041 * Test for alternating 0's and 1's
1042 */
1043 *(volatile int *)ptr = 0x55555555;
1044 if (*(volatile int *)ptr != 0x55555555)
1045 page_bad = true;
1046 /*
1047 * Test for all 1's
1048 */
1049 *(volatile int *)ptr = 0xffffffff;
1050 if (*(volatile int *)ptr != 0xffffffff)
1051 page_bad = true;
1052 /*
1053 * Test for all 0's
1054 */
1055 *(volatile int *)ptr = 0x0;
1056 if (*(volatile int *)ptr != 0x0)
1057 page_bad = true;
1058 /*
1059 * Restore original value.
1060 */
1061 *(int *)ptr = tmp;
1062
1063 skip_memtest:
1064 /*
1065 * Adjust array of valid/good pages.
1066 */
1067 if (page_bad == true)
1068 continue;
1069 /*
1070 * If this good page is a continuation of the
1071 * previous set of good pages, then just increase
1072 * the end pointer. Otherwise start a new chunk.
1073 * Note that "end" points one higher than end,
1074 * making the range >= start and < end.
1075 * If we're also doing a speculative memory test
1076 * and are at or past the end, bump up Maxmem so
1077 * that we keep going. The first bad page
1078 * will terminate the loop.
1079 */
1080 if (phys_avail[pa_indx] == pa) {
1081 phys_avail[pa_indx] += PAGE_SIZE;
1082 } else {
1083 pa_indx++;
1084 if (pa_indx == PHYS_AVAIL_ENTRIES) {
1085 printf(
1086 "Too many holes in the physical address space, giving up\n");
1087 pa_indx--;
1088 full = true;
1089 goto do_dump_avail;
1090 }
1091 phys_avail[pa_indx++] = pa; /* start */
1092 phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */
1093 }
1094 physmem++;
1095 do_dump_avail:
1096 if (dump_avail[da_indx] == pa) {
1097 dump_avail[da_indx] += PAGE_SIZE;
1098 } else {
1099 da_indx++;
1100 if (da_indx == PHYS_AVAIL_ENTRIES) {
1101 da_indx--;
1102 goto do_next;
1103 }
1104 dump_avail[da_indx++] = pa; /* start */
1105 dump_avail[da_indx] = pa + PAGE_SIZE; /* end */
1106 }
1107 do_next:
1108 if (full)
1109 break;
1110 }
1111 }
1112 *pte = 0;
1113 invltlb();
1114 if (memtest != 0)
1115 printf("\n");
1116
1117 /*
1118 * XXX
1119 * The last chunk must contain at least one page plus the message
1120 * buffer to avoid complicating other code (message buffer address
1121 * calculation, etc.).
1122 */
1123 while (phys_avail[pa_indx - 1] + PAGE_SIZE +
1124 round_page(msgbufsize) >= phys_avail[pa_indx]) {
1125 physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
1126 phys_avail[pa_indx--] = 0;
1127 phys_avail[pa_indx--] = 0;
1128 }
1129
1130 Maxmem = atop(phys_avail[pa_indx]);
1131
1132 /* Trim off space for the message buffer. */
1133 phys_avail[pa_indx] -= round_page(msgbufsize);
1134
1135 /* Map the message buffer. */
1136 msgbufp = (struct msgbuf *)PHYS_TO_DMAP(phys_avail[pa_indx]);
1137 TSEXIT();
1138 }
1139
1140 static void
1141 native_parse_preload_data(u_int64_t modulep)
1142 {
1143 char *envp;
1144 #ifdef DDB
1145 vm_offset_t ksym_start;
1146 vm_offset_t ksym_end;
1147 #endif
1148
1149 preload_metadata = (caddr_t)(uintptr_t)(modulep + KERNBASE);
1150 preload_bootstrap_relocate(KERNBASE);
1151 preload_initkmdp(true);
1152 boothowto = MD_FETCH(preload_kmdp, MODINFOMD_HOWTO, int);
1153 envp = MD_FETCH(preload_kmdp, MODINFOMD_ENVP, char *);
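/* The loader passes a physical pointer; relocate it into the KERNBASE mapping. */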
1154 if (envp != NULL)
1155 envp += KERNBASE;
1156 init_static_kenv(envp, 0);
1157 #ifdef DDB
1158 ksym_start = MD_FETCH(preload_kmdp, MODINFOMD_SSYM, uintptr_t);
1159 ksym_end = MD_FETCH(preload_kmdp, MODINFOMD_ESYM, uintptr_t);
1160 db_fetch_ksymtab(ksym_start, ksym_end, 0);
1161 #endif
1162 efi_systbl_phys = MD_FETCH(preload_kmdp, MODINFOMD_FW_HANDLE,
1163 vm_paddr_t);
1164 }
1165
1166 static void
1167 native_clock_source_init(void)
1168 {
1169 i8254_init();
1170 }
1171
1172 static void
1173 amd64_kdb_init(void)
1174 {
1175 kdb_init();
1176 #ifdef KDB
1177 if (boothowto & RB_KDB)
1178 kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger");
1179 #endif
1180 }
1181
1182 /* Set up the fast syscall stuff */
1183 void
1184 amd64_conf_fast_syscall(void)
1185 {
1186 uint64_t msr;
1187
1188 msr = rdmsr(MSR_EFER) | EFER_SCE;
1189 wrmsr(MSR_EFER, msr);
1190 wrmsr(MSR_LSTAR, pti ? (u_int64_t)IDTVEC(fast_syscall_pti) :
1191 (u_int64_t)IDTVEC(fast_syscall));
1192 wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
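/* MSR_STAR holds the SYSCALL kernel CS/SS selector base in bits 47:32 and the SYSRET user selector base in bits 63:48. */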
1193 msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
1194 ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
1195 wrmsr(MSR_STAR, msr);
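/* Flags set in MSR_SF_MASK are cleared in %rflags on SYSCALL entry. */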
1196 wrmsr(MSR_SF_MASK, PSL_NT | PSL_T | PSL_I | PSL_C | PSL_D | PSL_AC);
1197 }
1198
1199 void
1200 amd64_bsp_pcpu_init1(struct pcpu *pc)
1201 {
1202 struct user_segment_descriptor *gdt;
1203
1204 PCPU_SET(prvspace, pc);
1205 gdt = *PCPU_PTR(gdt);
1206 PCPU_SET(curthread, &thread0);
1207 PCPU_SET(tssp, PCPU_PTR(common_tss));
1208 PCPU_SET(tss, (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
1209 PCPU_SET(ldt, (struct system_segment_descriptor *)&gdt[GUSERLDT_SEL]);
1210 PCPU_SET(fs32p, &gdt[GUFS32_SEL]);
1211 PCPU_SET(gs32p, &gdt[GUGS32_SEL]);
1212 PCPU_SET(ucr3_load_mask, PMAP_UCR3_NOMASK);
1213 PCPU_SET(smp_tlb_gen, 1);
1214 }
1215
1216 void
1217 amd64_bsp_pcpu_init2(uint64_t rsp0)
1218 {
1219
1220 PCPU_SET(rsp0, rsp0);
1221 PCPU_SET(pti_rsp0, ((vm_offset_t)PCPU_PTR(pti_stack) +
1222 PC_PTI_STACK_SZ * sizeof(uint64_t)) & ~0xful);
1223 PCPU_SET(curpcb, thread0.td_pcb);
1224 }
1225
1226 void
1227 amd64_bsp_ist_init(struct pcpu *pc)
1228 {
1229 struct nmi_pcpu *np;
1230 struct amd64tss *tssp;
1231
1232 tssp = &pc->pc_common_tss;
1233
1234 /* doublefault stack space, runs on ist1 */
1235 np = ((struct nmi_pcpu *)&dblfault_stack[sizeof(dblfault_stack)]) - 1;
1236 np->np_pcpu = (register_t)pc;
1237 tssp->tss_ist1 = (long)np;
1238
1239 /*
1240 * NMI stack, runs on ist2. The pcpu pointer is stored just
1241 * above the start of the ist2 stack.
1242 */
1243 np = ((struct nmi_pcpu *)&nmi0_stack[sizeof(nmi0_stack)]) - 1;
1244 np->np_pcpu = (register_t)pc;
1245 tssp->tss_ist2 = (long)np;
1246
1247 /*
1248 * MC# stack, runs on ist3. The pcpu pointer is stored just
1249 * above the start of the ist3 stack.
1250 */
1251 np = ((struct nmi_pcpu *)&mce0_stack[sizeof(mce0_stack)]) - 1;
1252 np->np_pcpu = (register_t)pc;
1253 tssp->tss_ist3 = (long)np;
1254
1255 /*
1256 * DB# stack, runs on ist4.
1257 */
1258 np = ((struct nmi_pcpu *)&dbg0_stack[sizeof(dbg0_stack)]) - 1;
1259 np->np_pcpu = (register_t)pc;
1260 tssp->tss_ist4 = (long)np;
1261 }
1262
1263 /*
1264 * Calculate the kernel load address by inspecting page table created by loader.
1265 * The assumptions:
1266 * - kernel is mapped at KERNBASE, backed by contiguous phys memory
1267 * aligned at 2M, below 4G (the latter is important for AP startup)
1268 * - there is a 2M hole at KERNBASE (KERNSTART = KERNBASE + 2M)
1269 * - kernel is mapped with 2M superpages
1270 * - all participating memory, i.e. kernel, modules, metadata,
1271 * page table is accessible by pre-created 1:1 mapping
1272 * (right now loader creates 1:1 mapping for lower 4G, and all
1273 * memory is from there)
1274 * - there is a usable memory block right after the end of the
1275 * mapped kernel and all modules/metadata, pointed to by
1276 * physfree, for early allocations
1277 */
1278 vm_paddr_t __nosanitizeaddress __nosanitizememory
1279 amd64_loadaddr(void)
1280 {
1281 pml4_entry_t *pml4e;
1282 pdp_entry_t *pdpe;
1283 pd_entry_t *pde;
1284 uint64_t cr3;
1285
1286 cr3 = rcr3();
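/* Walk the loader-built page tables via its 1:1 mapping down to the 2M PDE that maps KERNSTART. */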
1287 pml4e = (pml4_entry_t *)cr3 + pmap_pml4e_index(KERNSTART);
1288 pdpe = (pdp_entry_t *)(*pml4e & PG_FRAME) + pmap_pdpe_index(KERNSTART);
1289 pde = (pd_entry_t *)(*pdpe & PG_FRAME) + pmap_pde_index(KERNSTART);
1290 return (*pde & PG_FRAME);
1291 }
1292
1293 u_int64_t
1294 hammer_time(u_int64_t modulep, u_int64_t physfree)
1295 {
1296 int gsel_tss, x;
1297 struct pcpu *pc;
1298 uint64_t rsp0;
1299 char *env;
1300 struct user_segment_descriptor *gdt;
1301 struct region_descriptor r_gdt;
1302 size_t kstack0_sz;
1303
1304 TSRAW(&thread0, TS_ENTER, __func__, NULL);
1305
1306 kernphys = amd64_loadaddr();
1307
1308 physfree += kernphys;
1309
1310 /* Initializes preload_kmdp */
1311 init_ops.parse_preload_data(modulep);
1312
1313 efi_boot = preload_search_info(preload_kmdp, MODINFO_METADATA |
1314 MODINFOMD_EFI_MAP) != NULL;
1315
1316 if (!efi_boot) {
1317 /* Tell the bios to warmboot next time */
1318 atomic_store_short((u_short *)0x472, 0x1234);
1319 }
1320
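/* physfree is a physical address; kernphys is mapped at KERNSTART, so this converts it to a kernel VA for ucode_load_bsp(). */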
1321 physfree += ucode_load_bsp(physfree - kernphys + KERNSTART);
1322 physfree = roundup2(physfree, PAGE_SIZE);
1323
1324 identify_cpu1();
1325 identify_hypervisor();
1326 identify_hypervisor_smbios();
1327 identify_cpu_fixup_bsp();
1328 identify_cpu2();
1329 initializecpucache();
1330
1331 /*
1332 * Check for pti, pcid, and invpcid before ifuncs are
1333 * resolved, to correctly select the implementation for
1334 * pmap_activate_sw_mode().
1335 */
1336 pti = pti_get_default();
1337 TUNABLE_INT_FETCH("vm.pmap.pti", &pti);
1338 TUNABLE_INT_FETCH("vm.pmap.pcid_enabled", &pmap_pcid_enabled);
1339 if ((cpu_feature2 & CPUID2_PCID) == 0)
1340 pmap_pcid_enabled = 0;
1341 invpcid_works = (cpu_stdext_feature & CPUID_STDEXT_INVPCID) != 0;
1342
1343 /*
1344 * Now we can do small core initialization, after the PCID
1345 * CPU features and user knobs are evaluated.
1346 */
1347 TUNABLE_INT_FETCH("vm.pmap.pcid_invlpg_workaround",
1348 &pmap_pcid_invlpg_workaround_uena);
1349 cpu_init_small_core();
1350
1351 if ((cpu_feature2 & CPUID2_XSAVE) != 0) {
1352 use_xsave = 1;
1353 TUNABLE_INT_FETCH("hw.use_xsave", &use_xsave);
1354 }
1355
1356 link_elf_ireloc();
1357
1358 /*
1359 * This may be done better later if it gets more high level
1360 * components in it. If so just link td->td_proc here.
1361 */
1362 proc_linkup0(&proc0, &thread0);
1363
1364 /* Init basic tunables, hz etc */
1365 init_param1();
1366
1367 thread0.td_kstack = physfree - kernphys + KERNSTART;
1368 thread0.td_kstack_pages = kstack_pages;
1369 kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE;
1370 bzero((void *)thread0.td_kstack, kstack0_sz);
1371 physfree += kstack0_sz;
1372
1373 /*
1374 * Initialize enough of thread0 for delayed invalidation to
1375 * work very early. Rely on thread0.td_base_pri
1376 * zero-initialization, it is reset to PVM at proc0_init().
1377 */
1378 pmap_thread_init_invl_gen(&thread0);
1379
1380 pc = &temp_bsp_pcpu;
1381 pcpu_init(pc, 0, sizeof(struct pcpu));
1382 gdt = &temp_bsp_pcpu.pc_gdt[0];
1383
1384 /*
1385 * make gdt memory segments
1386 */
1387 for (x = 0; x < NGDT; x++) {
1388 if (x != GPROC0_SEL && x != (GPROC0_SEL + 1) &&
1389 x != GUSERLDT_SEL && x != (GUSERLDT_SEL + 1))
1390 ssdtosd(&gdt_segs[x], &gdt[x]);
1391 }
1392 gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&pc->pc_common_tss;
1393 ssdtosyssd(&gdt_segs[GPROC0_SEL],
1394 (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
1395
1396 r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
1397 r_gdt.rd_base = (long)gdt;
1398 lgdt(&r_gdt);
1399
1400 wrmsr(MSR_FSBASE, 0); /* User value */
1401 wrmsr(MSR_GSBASE, (u_int64_t)pc);
1402 wrmsr(MSR_KGSBASE, 0); /* User value while in the kernel */
1403
1404 dpcpu_init((void *)(physfree - kernphys + KERNSTART), 0);
1405 physfree += DPCPU_SIZE;
1406 amd64_bsp_pcpu_init1(pc);
1407 /* Non-late cninit() and printf() can be moved up to here. */
1408
1409 /*
1410 * Initialize mutexes.
1411 *
1412 * icu_lock: in order to allow an interrupt to occur in a critical
1413 * section, to set pcpu->ipending (etc...) properly, we
1414 * must be able to get the icu lock, so it can't be
1415 * under witness.
1416 */
1417 mutex_init();
1418 mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS);
1419 mtx_init(&dt_lock, "descriptor tables", NULL, MTX_DEF);
1420
1421 /* exceptions */
1422 for (x = 0; x < NIDT; x++)
1423 setidt(x, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_SYSIGT,
1424 SEL_KPL, 0);
1425 setidt(IDT_DE, pti ? &IDTVEC(div_pti) : &IDTVEC(div), SDT_SYSIGT,
1426 SEL_KPL, 0);
1427 setidt(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 4);
1428 setidt(IDT_NMI, &IDTVEC(nmi), SDT_SYSIGT, SEL_KPL, 2);
1429 setidt(IDT_BP, pti ? &IDTVEC(bpt_pti) : &IDTVEC(bpt), SDT_SYSIGT,
1430 SEL_UPL, 0);
1431 setidt(IDT_OF, pti ? &IDTVEC(ofl_pti) : &IDTVEC(ofl), SDT_SYSIGT,
1432 SEL_UPL, 0);
1433 setidt(IDT_BR, pti ? &IDTVEC(bnd_pti) : &IDTVEC(bnd), SDT_SYSIGT,
1434 SEL_KPL, 0);
1435 setidt(IDT_UD, pti ? &IDTVEC(ill_pti) : &IDTVEC(ill), SDT_SYSIGT,
1436 SEL_KPL, 0);
1437 setidt(IDT_NM, pti ? &IDTVEC(dna_pti) : &IDTVEC(dna), SDT_SYSIGT,
1438 SEL_KPL, 0);
1439 setidt(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
1440 setidt(IDT_FPUGP, pti ? &IDTVEC(fpusegm_pti) : &IDTVEC(fpusegm),
1441 SDT_SYSIGT, SEL_KPL, 0);
1442 setidt(IDT_TS, pti ? &IDTVEC(tss_pti) : &IDTVEC(tss), SDT_SYSIGT,
1443 SEL_KPL, 0);
1444 setidt(IDT_NP, pti ? &IDTVEC(missing_pti) : &IDTVEC(missing),
1445 SDT_SYSIGT, SEL_KPL, 0);
1446 setidt(IDT_SS, pti ? &IDTVEC(stk_pti) : &IDTVEC(stk), SDT_SYSIGT,
1447 SEL_KPL, 0);
1448 setidt(IDT_GP, pti ? &IDTVEC(prot_pti) : &IDTVEC(prot), SDT_SYSIGT,
1449 SEL_KPL, 0);
1450 setidt(IDT_PF, pti ? &IDTVEC(page_pti) : &IDTVEC(page), SDT_SYSIGT,
1451 SEL_KPL, 0);
1452 setidt(IDT_MF, pti ? &IDTVEC(fpu_pti) : &IDTVEC(fpu), SDT_SYSIGT,
1453 SEL_KPL, 0);
1454 setidt(IDT_AC, pti ? &IDTVEC(align_pti) : &IDTVEC(align), SDT_SYSIGT,
1455 SEL_KPL, 0);
1456 setidt(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 3);
1457 setidt(IDT_XF, pti ? &IDTVEC(xmm_pti) : &IDTVEC(xmm), SDT_SYSIGT,
1458 SEL_KPL, 0);
1459 #ifdef KDTRACE_HOOKS
1460 setidt(IDT_DTRACE_RET, pti ? &IDTVEC(dtrace_ret_pti) :
1461 &IDTVEC(dtrace_ret), SDT_SYSIGT, SEL_UPL, 0);
1462 #endif
1463 #ifdef XENHVM
1464 setidt(IDT_EVTCHN, pti ? &IDTVEC(xen_intr_upcall_pti) :
1465 &IDTVEC(xen_intr_upcall), SDT_SYSIGT, SEL_KPL, 0);
1466 #endif
1467 r_idt.rd_limit = sizeof(idt0) - 1;
1468 r_idt.rd_base = (long) idt;
1469 lidt(&r_idt);
1470
1471 TUNABLE_INT_FETCH("hw.ibrs_disable", &hw_ibrs_disable);
1472 TUNABLE_INT_FETCH("machdep.mitigations.ibrs.disable", &hw_ibrs_disable);
1473
1474 TUNABLE_INT_FETCH("hw.spec_store_bypass_disable", &hw_ssb_disable);
1475 TUNABLE_INT_FETCH("machdep.mitigations.ssb.disable", &hw_ssb_disable);
1476
1477 TUNABLE_INT_FETCH("machdep.syscall_ret_flush_l1d",
1478 &syscall_ret_l1d_flush_mode);
1479
1480 TUNABLE_INT_FETCH("hw.mds_disable", &hw_mds_disable);
1481 TUNABLE_INT_FETCH("machdep.mitigations.mds.disable", &hw_mds_disable);
1482
1483 TUNABLE_INT_FETCH("machdep.mitigations.taa.enable", &x86_taa_enable);
1484
1485 TUNABLE_INT_FETCH("machdep.mitigations.rngds.enable",
1486 &x86_rngds_mitg_enable);
1487
1488 TUNABLE_INT_FETCH("machdep.mitigations.zenbleed.enable",
1489 &zenbleed_enable);
1490 zenbleed_sanitize_enable();
1491
1492 finishidentcpu(); /* Final stage of CPU initialization */
1493
1494 invlpgb_works = (amd_extended_feature_extensions &
1495 AMDFEID_INVLPGB) != 0;
1496 TUNABLE_INT_FETCH("vm.pmap.invlpgb_works", &invlpgb_works);
1497 if (invlpgb_works)
1498 invlpgb_maxcnt = cpu_procinfo3 & AMDID_INVLPGB_MAXCNT;
1499
1500 /*
1501 * Initialize the clock before the console so that console
1502 * initialization can use DELAY().
1503 */
1504 clock_init();
1505
1506 initializecpu(); /* Initialize CPU registers */
1507
1508 amd64_bsp_ist_init(pc);
1509
1510 /* Set the IO permission bitmap (empty due to tss seg limit) */
1511 pc->pc_common_tss.tss_iobase = sizeof(struct amd64tss) +
1512 IOPERM_BITMAP_SIZE;
1513
1514 gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
1515 ltr(gsel_tss);
1516
1517 amd64_conf_fast_syscall();
1518
1519 /*
1520 * We initialize the PCB pointer early so that exception
1521 * handlers will work. Also set up td_critnest to short-cut
1522 * the page fault handler.
1523 */
1524 cpu_max_ext_state_size = sizeof(struct savefpu);
1525 set_top_of_stack_td(&thread0);
1526 thread0.td_pcb = get_pcb_td(&thread0);
1527 thread0.td_critnest = 1;
1528
1529 /*
1530 * The console and kdb should be initialized even earlier than here,
1531 * but some console drivers don't work until after getmemsize().
1532 * Default to late console initialization to support these drivers.
1533 * This loses mainly printf()s in getmemsize() and early debugging.
1534 */
1535 TUNABLE_INT_FETCH("debug.late_console", &late_console);
1536 if (!late_console) {
1537 cninit();
1538 amd64_kdb_init();
1539 }
1540
1541 getmemsize(physfree);
1542 init_param2(physmem);
1543
1544 /* now running on new page tables, configured, and u/iom is accessible */
1545
1546 #ifdef DEV_PCI
1547 /* This call might adjust phys_avail[]. */
1548 pci_early_quirks();
1549 #endif
1550
1551 if (late_console)
1552 cninit();
1553
1554 /*
1555 * Dump the boot metadata. We have to wait for cninit() since console
1556 * output is required. If it's grossly incorrect the kernel will never
1557 * make it this far.
1558 */
1559 if (getenv_is_true("debug.dump_modinfo_at_boot"))
1560 preload_dump();
1561
1562 #ifdef DEV_ISA
1563 #ifdef DEV_ATPIC
1564 elcr_probe();
1565 atpic_startup();
1566 #else
1567 /* Reset and mask the atpics and leave them shut down. */
1568 atpic_reset();
1569
1570 /*
1571 * Point the ICU spurious interrupt vectors at the APIC spurious
1572 * interrupt handler.
1573 */
1574 setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
1575 setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
1576 #endif
1577 #else
1578 #error "have you forgotten the isa device?"
1579 #endif
1580
1581 if (late_console)
1582 amd64_kdb_init();
1583
1584 msgbufinit(msgbufp, msgbufsize);
1585 fpuinit();
1586
1587 /* make an initial tss so cpu can get interrupt stack on syscall! */
1588 rsp0 = thread0.td_md.md_stack_base;
1589 /* Ensure the stack is aligned to 16 bytes */
1590 rsp0 &= ~0xFul;
1591 PCPU_PTR(common_tss)->tss_rsp0 = rsp0;
1592 amd64_bsp_pcpu_init2(rsp0);
1593
1594 /* transfer to user mode */
1595
1596 _ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
1597 _udatasel = GSEL(GUDATA_SEL, SEL_UPL);
1598 _ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL);
1599 _ufssel = GSEL(GUFS32_SEL, SEL_UPL);
1600 _ugssel = GSEL(GUGS32_SEL, SEL_UPL);
1601
1602 load_ds(_udatasel);
1603 load_es(_udatasel);
1604 load_fs(_ufssel);
1605
1606 /* setup proc 0's pcb */
1607 thread0.td_pcb->pcb_flags = 0;
1608
1609 env = kern_getenv("kernelname");
1610 if (env != NULL)
1611 strlcpy(kernelname, env, sizeof(kernelname));
1612
1613 kcsan_cpu_init(0);
1614
1615 #ifdef FDT
1616 x86_init_fdt();
1617 #endif
1618 thread0.td_critnest = 0;
1619
1620 kasan_init();
1621 kmsan_init();
1622
1623 TSEXIT();
1624
1625 /* Location of kernel stack for locore */
1626 return (thread0.td_md.md_stack_base);
1627 }
1628
1629 void
1630 cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
1631 {
1632
1633 pcpu->pc_acpi_id = 0xffffffff;
1634 }
1635
1636 static int
1637 smap_sysctl_handler(SYSCTL_HANDLER_ARGS)
1638 {
1639 struct bios_smap *smapbase;
1640 struct bios_smap_xattr smap;
1641 uint32_t *smapattr;
1642 int count, error, i;
1643
1644 /* Retrieve the system memory map from the loader. */
1645 smapbase = (struct bios_smap *)preload_search_info(preload_kmdp,
1646 MODINFO_METADATA | MODINFOMD_SMAP);
1647 if (smapbase == NULL)
1648 return (0);
1649 smapattr = (uint32_t *)preload_search_info(preload_kmdp,
1650 MODINFO_METADATA | MODINFOMD_SMAP_XATTR);
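/* The loader stores the table size in the 32-bit word immediately preceding the data. */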
1651 count = *((uint32_t *)smapbase - 1) / sizeof(*smapbase);
1652 error = 0;
1653 for (i = 0; i < count; i++) {
1654 smap.base = smapbase[i].base;
1655 smap.length = smapbase[i].length;
1656 smap.type = smapbase[i].type;
1657 if (smapattr != NULL)
1658 smap.xattr = smapattr[i];
1659 else
1660 smap.xattr = 0;
1661 error = SYSCTL_OUT(req, &smap, sizeof(smap));
1662 }
1663 return (error);
1664 }
1665 SYSCTL_PROC(_machdep, OID_AUTO, smap,
1666 CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
1667 smap_sysctl_handler, "S,bios_smap_xattr",
1668 "Raw BIOS SMAP data");
1669
1670 static int
1671 efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS)
1672 {
1673 struct efi_map_header *efihdr;
1674 uint32_t efisize;
1675
1676 efihdr = (struct efi_map_header *)preload_search_info(preload_kmdp,
1677 MODINFO_METADATA | MODINFOMD_EFI_MAP);
1678 if (efihdr == NULL)
1679 return (0);
1680 efisize = *((uint32_t *)efihdr - 1);
1681 return (SYSCTL_OUT(req, efihdr, efisize));
1682 }
1683 SYSCTL_PROC(_machdep, OID_AUTO, efi_map,
1684 CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
1685 efi_map_sysctl_handler, "S,efi_map_header",
1686 "Raw EFI Memory Map");
1687
1688 static int
1689 efi_arch_sysctl_handler(SYSCTL_HANDLER_ARGS)
1690 {
1691 char *arch;
1692
1693 arch = (char *)preload_search_info(preload_kmdp,
1694 MODINFO_METADATA | MODINFOMD_EFI_ARCH);
1695 if (arch == NULL)
1696 return (0);
1697
1698 return (SYSCTL_OUT_STR(req, arch));
1699 }
1700 SYSCTL_PROC(_machdep, OID_AUTO, efi_arch,
1701 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
1702 efi_arch_sysctl_handler, "A", "EFI Firmware Architecture");
1703
1704 void
1705 spinlock_enter(void)
1706 {
1707 struct thread *td;
1708 register_t flags;
1709
1710 td = curthread;
1711 if (td->td_md.md_spinlock_count == 0) {
1712 flags = intr_disable();
1713 td->td_md.md_spinlock_count = 1;
1714 td->td_md.md_saved_flags = flags;
1715 critical_enter();
1716 } else
1717 td->td_md.md_spinlock_count++;
1718 }
1719
1720 void
1721 spinlock_exit(void)
1722 {
1723 struct thread *td;
1724 register_t flags;
1725
1726 td = curthread;
1727 flags = td->td_md.md_saved_flags;
1728 td->td_md.md_spinlock_count--;
1729 if (td->td_md.md_spinlock_count == 0) {
1730 critical_exit();
1731 intr_restore(flags);
1732 }
1733 }
1734
1735 /*
1736 * Construct a PCB from a trapframe. This is called from kdb_trap() where
1737 * we want to start a backtrace from the function that caused us to enter
1738 * the debugger. We have the context in the trapframe, but base the trace
1739 * on the PCB. The PCB doesn't have to be perfect, as long as it contains
1740 * enough for a backtrace.
1741 */
1742 void
1743 makectx(struct trapframe *tf, struct pcb *pcb)
1744 {
1745
1746 pcb->pcb_r12 = tf->tf_r12;
1747 pcb->pcb_r13 = tf->tf_r13;
1748 pcb->pcb_r14 = tf->tf_r14;
1749 pcb->pcb_r15 = tf->tf_r15;
1750 pcb->pcb_rbp = tf->tf_rbp;
1751 pcb->pcb_rbx = tf->tf_rbx;
1752 pcb->pcb_rip = tf->tf_rip;
1753 pcb->pcb_rsp = tf->tf_rsp;
1754 }
1755
1756 /*
1757 * The pcb_flags is only modified by current thread, or by other threads
1758 * when current thread is stopped. However, current thread may change it
1759 * from the interrupt context in cpu_switch(), or in the trap handler.
1760 * When we read-modify-write pcb_flags from C sources, compiler may generate
1761 * code that is not atomic regarding the interrupt handler. If a trap or
1762 * interrupt happens and any flag is modified from the handler, it can be
1763 * clobbered with the cached value later. Therefore, we implement setting
1764 * and clearing flags with single-instruction functions, which do not race
1765 * with possible modification of the flags from the trap or interrupt context,
1766 * because traps and interrupts are executed only on instruction boundary.
1767 */
1768 void
1769 set_pcb_flags_raw(struct pcb *pcb, const u_int flags)
1770 {
1771
1772 __asm __volatile("orl %1,%0"
1773 : "=m" (pcb->pcb_flags) : "ir" (flags), "m" (pcb->pcb_flags)
1774 : "cc", "memory");
1775
1776 }
1777
1778 /*
1779 * The support for RDFSBASE, WRFSBASE and similar instructions for %gs
1780 * base requires that kernel saves MSR_FSBASE and MSR_{K,}GSBASE into
1781 * pcb if user space modified the bases. We must save on the context
1782 * switch or if the return to usermode happens through the doreti.
1783 *
1784 * Tracking of both events is performed by the pcb flag PCB_FULL_IRET,
1785 * which have a consequence that the base MSRs must be saved each time
1786 * the PCB_FULL_IRET flag is set. We disable interrupts to sync with
1787 * context switches.
1788 */
1789 static void
1790 set_pcb_flags_fsgsbase(struct pcb *pcb, const u_int flags)
1791 {
1792 register_t r;
1793
1794 if (curpcb == pcb &&
1795 (flags & PCB_FULL_IRET) != 0 &&
1796 (pcb->pcb_flags & PCB_FULL_IRET) == 0) {
1797 r = intr_disable();
1798 if ((pcb->pcb_flags & PCB_FULL_IRET) == 0) {
1799 pcb->pcb_fsbase = rdfsbase();
1800 pcb->pcb_gsbase = rdmsr(MSR_KGSBASE);
1801 }
1802 set_pcb_flags_raw(pcb, flags);
1803 intr_restore(r);
1804 } else {
1805 set_pcb_flags_raw(pcb, flags);
1806 }
1807 }
1808
1809 DEFINE_IFUNC(, void, set_pcb_flags, (struct pcb *, const u_int))
1810 {
1811
1812 return ((cpu_stdext_feature & CPUID_STDEXT_FSGSBASE) != 0 ?
1813 set_pcb_flags_fsgsbase : set_pcb_flags_raw);
1814 }
1815
1816 void
1817 clear_pcb_flags(struct pcb *pcb, const u_int flags)
1818 {
1819
1820 __asm __volatile("andl %1,%0"
1821 : "=m" (pcb->pcb_flags) : "ir" (~flags), "m" (pcb->pcb_flags)
1822 : "cc", "memory");
1823 }
1824
1825 #ifdef KDB
1826
1827 /*
1828 * Provide inb() and outb() as functions. They are normally only available as
1829 * inline functions, thus cannot be called from the debugger.
1830 */
1831
1832 /* silence compiler warnings */
1833 u_char inb_(u_short);
1834 void outb_(u_short, u_char);
1835
1836 u_char
1837 inb_(u_short port)
1838 {
1839 return inb(port);
1840 }
1841
1842 void
1843 outb_(u_short port, u_char data)
1844 {
1845 outb(port, data);
1846 }
1847
1848 #endif /* KDB */
1849
1850 #undef memset
1851 #undef memmove
1852 #undef memcpy
1853
1854 void *memset_std(void *buf, int c, size_t len);
1855 void *memset_erms(void *buf, int c, size_t len);
1856 void *memmove_std(void * _Nonnull dst, const void * _Nonnull src,
1857 size_t len);
1858 void *memmove_erms(void * _Nonnull dst, const void * _Nonnull src,
1859 size_t len);
1860 void *memcpy_std(void * _Nonnull dst, const void * _Nonnull src,
1861 size_t len);
1862 void *memcpy_erms(void * _Nonnull dst, const void * _Nonnull src,
1863 size_t len);
1864
1865 #ifdef KCSAN
1866 /*
1867 * These fail to build as ifuncs when used with KCSAN.
1868 */
1869 void *
1870 memset(void *buf, int c, size_t len)
1871 {
1872
1873 return (memset_std(buf, c, len));
1874 }
1875
1876 void *
1877 memmove(void * _Nonnull dst, const void * _Nonnull src, size_t len)
1878 {
1879
1880 return (memmove_std(dst, src, len));
1881 }
1882
1883 void *
1884 memcpy(void * _Nonnull dst, const void * _Nonnull src, size_t len)
1885 {
1886
1887 return (memcpy_std(dst, src, len));
1888 }
1889 #else
1890 DEFINE_IFUNC(, void *, memset, (void *, int, size_t))
1891 {
1892
1893 return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
1894 memset_erms : memset_std);
1895 }
1896
1897 DEFINE_IFUNC(, void *, memmove, (void * _Nonnull, const void * _Nonnull,
1898 size_t))
1899 {
1900
1901 return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
1902 memmove_erms : memmove_std);
1903 }
1904
1905 DEFINE_IFUNC(, void *, memcpy, (void * _Nonnull, const void * _Nonnull,size_t))
1906 {
1907
1908 return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
1909 memcpy_erms : memcpy_std);
1910 }
1911 #endif
1912
1913 void pagezero_std(void *addr);
1914 void pagezero_erms(void *addr);
1915 DEFINE_IFUNC(, void , pagezero, (void *))
1916 {
1917
1918 return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
1919 pagezero_erms : pagezero_std);
1920 }
1921