xref: /freebsd/sys/amd64/amd64/machdep.c (revision e05999a8c5a9bfb2f5c91fbc1dc397d46eed5bbb)
1 /*-
2  * SPDX-License-Identifier: BSD-4-Clause
3  *
4  * Copyright (c) 2003 Peter Wemm.
5  * Copyright (c) 1992 Terrence R. Lambert.
6  * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
7  * All rights reserved.
8  *
9  * This code is derived from software contributed to Berkeley by
10  * William Jolitz.
11  *
12  * Redistribution and use in source and binary forms, with or without
13  * modification, are permitted provided that the following conditions
14  * are met:
15  * 1. Redistributions of source code must retain the above copyright
16  *    notice, this list of conditions and the following disclaimer.
17  * 2. Redistributions in binary form must reproduce the above copyright
18  *    notice, this list of conditions and the following disclaimer in the
19  *    documentation and/or other materials provided with the distribution.
20  * 3. All advertising materials mentioning features or use of this software
21  *    must display the following acknowledgement:
22  *	This product includes software developed by the University of
23  *	California, Berkeley and its contributors.
24  * 4. Neither the name of the University nor the names of its contributors
25  *    may be used to endorse or promote products derived from this software
26  *    without specific prior written permission.
27  *
28  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
29  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
32  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
33  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
34  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
35  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
37  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
38  * SUCH DAMAGE.
39  */
40 
41 #include <sys/cdefs.h>
42 #include "opt_atpic.h"
43 #include "opt_cpu.h"
44 #include "opt_ddb.h"
45 #include "opt_inet.h"
46 #include "opt_isa.h"
47 #include "opt_kstack_pages.h"
48 #include "opt_maxmem.h"
49 #include "opt_pci.h"
50 #include "opt_platform.h"
51 #include "opt_sched.h"
52 
53 #include <sys/param.h>
54 #include <sys/proc.h>
55 #include <sys/systm.h>
56 #include <sys/asan.h>
57 #include <sys/bio.h>
58 #include <sys/buf.h>
59 #include <sys/bus.h>
60 #include <sys/callout.h>
61 #include <sys/cons.h>
62 #include <sys/cpu.h>
63 #include <sys/csan.h>
64 #include <sys/efi.h>
65 #include <sys/eventhandler.h>
66 #include <sys/exec.h>
67 #include <sys/imgact.h>
68 #include <sys/kdb.h>
69 #include <sys/kernel.h>
70 #include <sys/ktr.h>
71 #include <sys/linker.h>
72 #include <sys/lock.h>
73 #include <sys/malloc.h>
74 #include <sys/memrange.h>
75 #include <sys/msan.h>
76 #include <sys/msgbuf.h>
77 #include <sys/mutex.h>
78 #include <sys/pcpu.h>
79 #include <sys/ptrace.h>
80 #include <sys/reboot.h>
81 #include <sys/reg.h>
82 #include <sys/rwlock.h>
83 #include <sys/sched.h>
84 #include <sys/signalvar.h>
85 #ifdef SMP
86 #include <sys/smp.h>
87 #endif
88 #include <sys/syscallsubr.h>
89 #include <sys/sysctl.h>
90 #include <sys/sysent.h>
91 #include <sys/sysproto.h>
92 #include <sys/ucontext.h>
93 #include <sys/vmmeter.h>
94 
95 #include <vm/vm.h>
96 #include <vm/vm_param.h>
97 #include <vm/vm_extern.h>
98 #include <vm/vm_kern.h>
99 #include <vm/vm_page.h>
100 #include <vm/vm_map.h>
101 #include <vm/vm_object.h>
102 #include <vm/vm_pager.h>
103 #include <vm/vm_phys.h>
104 #include <vm/vm_dumpset.h>
105 
106 #ifdef DDB
107 #ifndef KDB
108 #error KDB must be enabled in order for DDB to work!
109 #endif
110 #include <ddb/ddb.h>
111 #include <ddb/db_sym.h>
112 #endif
113 
114 #include <net/netisr.h>
115 
116 #include <dev/smbios/smbios.h>
117 
118 #include <machine/clock.h>
119 #include <machine/cpu.h>
120 #include <machine/cputypes.h>
121 #include <machine/frame.h>
122 #include <machine/intr_machdep.h>
123 #include <x86/mca.h>
124 #include <machine/md_var.h>
125 #include <machine/metadata.h>
126 #include <machine/pc/bios.h>
127 #include <machine/pcb.h>
128 #include <machine/proc.h>
129 #include <machine/sigframe.h>
130 #include <machine/specialreg.h>
131 #include <machine/trap.h>
132 #include <machine/tss.h>
133 #include <x86/ucode.h>
134 #include <x86/ifunc.h>
135 #ifdef SMP
136 #include <machine/smp.h>
137 #endif
138 #ifdef FDT
139 #include <x86/fdt.h>
140 #endif
141 
142 #ifdef DEV_ATPIC
143 #include <x86/isa/icu.h>
144 #else
145 #include <x86/apicvar.h>
146 #endif
147 
148 #include <isa/isareg.h>
149 #include <isa/rtc.h>
150 #include <x86/init.h>
151 
152 /* Sanity check for __curthread() */
153 CTASSERT(offsetof(struct pcpu, pc_curthread) == 0);
154 
155 /*
156  * The PTI trampoline stack needs enough space for a hardware trapframe and a
157  * couple of scratch registers, as well as the trapframe left behind after an
158  * iret fault.
159  */
160 CTASSERT(PC_PTI_STACK_SZ * sizeof(register_t) >= 2 * sizeof(struct pti_frame) -
161     offsetof(struct pti_frame, pti_rip));
162 
163 extern u_int64_t hammer_time(u_int64_t, u_int64_t);
164 
165 static void cpu_startup(void *);
166 SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL);
167 
168 /* Probe 8254 PIT and TSC. */
169 static void native_clock_source_init(void);
170 
171 /* Preload data parse function */
172 static void native_parse_preload_data(u_int64_t);
173 
174 /* Native function to fetch and parse the e820 map */
175 static void native_parse_memmap(vm_paddr_t *, int *);
176 
177 /* Default init_ops implementation. */
178 struct init_ops init_ops = {
179 	.parse_preload_data =		native_parse_preload_data,
180 	.early_clock_source_init =	native_clock_source_init,
181 	.early_delay =			i8254_delay,
182 	.parse_memmap =			native_parse_memmap,
183 };
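
/*
 * Note: alternative boot environments can substitute their own hooks here
 * before hammer_time() runs; the Xen PVH entry code, for instance, installs
 * its own init_ops so that the memory map and preload data come from the
 * hypervisor rather than the BIOS/loader.  (Illustrative; see sys/x86/xen.)
 */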
184 
185 /*
186  * Physical address of the EFI System Table. Stashed from the metadata hints
187  * passed into the kernel and used by the EFI code to call runtime services.
188  */
189 vm_paddr_t efi_systbl_phys;
190 
191 /*
192  * Bitmap of extra EFI memory region types that should be preserved and mapped
193  * during runtime services calls.
194  */
195 uint32_t efi_map_regs;
196 
197 /* Intel ICH registers */
198 #define ICH_PMBASE	0x400
199 #define ICH_SMI_EN	(ICH_PMBASE + 0x30)
200 
201 int	_udatasel, _ucodesel, _ucode32sel, _ufssel, _ugssel;
202 
203 int cold = 1;
204 
205 long Maxmem = 0;
206 long realmem = 0;
207 int late_console = 1;
208 
209 struct kva_md_info kmi;
210 
211 struct region_descriptor r_idt;
212 
213 struct pcpu *__pcpu;
214 struct pcpu temp_bsp_pcpu;
215 
216 struct mtx icu_lock;
217 
218 struct mem_range_softc mem_range_softc;
219 
220 struct mtx dt_lock;	/* lock for GDT and LDT */
221 
222 void (*vmm_suspend_p)(void);
223 void (*vmm_resume_p)(void);
224 
225 bool efi_boot;
226 
227 static void
228 cpu_startup(void *dummy)
229 {
230 	uintmax_t memsize;
231 	char *sysenv;
232 
233 	/*
234 	 * On MacBooks, we need to disallow the legacy USB circuit to
235 	 * generate an SMI# because this can cause several problems,
236 	 * namely: incorrect CPU frequency detection and failure to
237 	 * start the APs.
238 	 * We do this by disabling a bit in the SMI_EN (SMI Control and
239 	 * Enable register) of the Intel ICH LPC Interface Bridge.
240 	 */
241 	sysenv = kern_getenv("smbios.system.product");
242 	if (sysenv != NULL) {
243 		if (strncmp(sysenv, "MacBook1,1", 10) == 0 ||
244 		    strncmp(sysenv, "MacBook3,1", 10) == 0 ||
245 		    strncmp(sysenv, "MacBook4,1", 10) == 0 ||
246 		    strncmp(sysenv, "MacBookPro1,1", 13) == 0 ||
247 		    strncmp(sysenv, "MacBookPro1,2", 13) == 0 ||
248 		    strncmp(sysenv, "MacBookPro3,1", 13) == 0 ||
249 		    strncmp(sysenv, "MacBookPro4,1", 13) == 0 ||
250 		    strncmp(sysenv, "Macmini1,1", 10) == 0) {
251 			if (bootverbose)
252 				printf("Disabling LEGACY_USB_EN bit on "
253 				    "Intel ICH.\n");
254 			outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8);
255 		}
256 		freeenv(sysenv);
257 	}
258 
259 	/*
260 	 * Good {morning,afternoon,evening,night}.
261 	 */
262 	startrtclock();
263 	printcpuinfo();
264 
265 	/*
266 	 * Display physical memory if SMBIOS reports reasonable amount.
267 	 */
268 	memsize = 0;
269 	sysenv = kern_getenv("smbios.memory.enabled");
270 	if (sysenv != NULL) {
271 		memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10;
272 		freeenv(sysenv);
273 	}
274 	if (memsize < ptoa((uintmax_t)vm_free_count()))
275 		memsize = ptoa((uintmax_t)Maxmem);
276 	printf("real memory  = %ju (%ju MB)\n", memsize, memsize >> 20);
277 	realmem = atop(memsize);
278 
279 	/*
280 	 * Display any holes after the first chunk of extended memory.
281 	 */
282 	if (bootverbose) {
283 		int indx;
284 
285 		printf("Physical memory chunk(s):\n");
286 		for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
287 			vm_paddr_t size;
288 
289 			size = phys_avail[indx + 1] - phys_avail[indx];
290 			printf(
291 			    "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n",
292 			    (uintmax_t)phys_avail[indx],
293 			    (uintmax_t)phys_avail[indx + 1] - 1,
294 			    (uintmax_t)size, (uintmax_t)size / PAGE_SIZE);
295 		}
296 	}
297 
298 	vm_ksubmap_init(&kmi);
299 
300 	printf("avail memory = %ju (%ju MB)\n",
301 	    ptoa((uintmax_t)vm_free_count()),
302 	    ptoa((uintmax_t)vm_free_count()) / 1048576);
303 #ifdef DEV_PCI
304 	if (bootverbose && intel_graphics_stolen_base != 0)
305 		printf("intel stolen mem: base %#jx size %ju MB\n",
306 		    (uintmax_t)intel_graphics_stolen_base,
307 		    (uintmax_t)intel_graphics_stolen_size / 1024 / 1024);
308 #endif
309 
310 	/*
311 	 * Set up buffers, so they can be used to read disk labels.
312 	 */
313 	bufinit();
314 	vm_pager_bufferinit();
315 
316 	cpu_setregs();
317 }
318 
319 static void
320 late_ifunc_resolve(void *dummy __unused)
321 {
322 	link_elf_late_ireloc();
323 }
324 SYSINIT(late_ifunc_resolve, SI_SUB_CPU, SI_ORDER_ANY, late_ifunc_resolve, NULL);
325 
326 
327 void
328 cpu_setregs(void)
329 {
330 	register_t cr0;
331 
332 	TSENTER();
333 	cr0 = rcr0();
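	/*
	 * CR0_MP and CR0_NE select native x87 error reporting (#MF) over
	 * the legacy IRQ13 path; CR0_TS makes the first FPU use fault
	 * (#NM) so FPU state can be restored lazily; CR0_WP makes
	 * supervisor writes honor read-only PTEs (needed for
	 * copy-on-write); CR0_AM allows #AC alignment checking for user
	 * code that sets EFLAGS.AC.
	 */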
334 	cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM;
335 	TSENTER2("load_cr0");
336 	load_cr0(cr0);
337 	TSEXIT2("load_cr0");
338 	TSEXIT();
339 }
340 
341 /*
342  * Initialize amd64 and configure to run kernel
343  */
344 
345 /*
346  * Initialize segments & interrupt table
347  */
348 static struct gate_descriptor idt0[NIDT];
349 struct gate_descriptor *idt = &idt0[0];	/* interrupt descriptor table */
350 
351 static char dblfault_stack[DBLFAULT_STACK_SIZE] __aligned(16);
352 static char mce0_stack[MCE_STACK_SIZE] __aligned(16);
353 static char nmi0_stack[NMI_STACK_SIZE] __aligned(16);
354 static char dbg0_stack[DBG_STACK_SIZE] __aligned(16);
355 CTASSERT(sizeof(struct nmi_pcpu) == 16);
356 
357 /*
358  * Software prototypes -- in more palatable form.
359  *
360  * Keep GUFS32, GUGS32, GUCODE32 and GUDATA at the same
361  * slots as corresponding segments for i386 kernel.
362  */
363 struct soft_segment_descriptor gdt_segs[] = {
364 [GNULL_SEL] = { /* 0 Null Descriptor */
365 	.ssd_base = 0x0,
366 	.ssd_limit = 0x0,
367 	.ssd_type = 0,
368 	.ssd_dpl = 0,
369 	.ssd_p = 0,
370 	.ssd_long = 0,
371 	.ssd_def32 = 0,
372 	.ssd_gran = 0		},
373 [GNULL2_SEL] = { /*	1 Null Descriptor */
374 	.ssd_base = 0x0,
375 	.ssd_limit = 0x0,
376 	.ssd_type = 0,
377 	.ssd_dpl = 0,
378 	.ssd_p = 0,
379 	.ssd_long = 0,
380 	.ssd_def32 = 0,
381 	.ssd_gran = 0		},
382 [GUFS32_SEL] = { /* 2 32 bit %gs Descriptor for user */
383 	.ssd_base = 0x0,
384 	.ssd_limit = 0xfffff,
385 	.ssd_type = SDT_MEMRWA,
386 	.ssd_dpl = SEL_UPL,
387 	.ssd_p = 1,
388 	.ssd_long = 0,
389 	.ssd_def32 = 1,
390 	.ssd_gran = 1		},
391 [GUGS32_SEL] = { /* 3 32 bit %fs Descriptor for user */
392 	.ssd_base = 0x0,
393 	.ssd_limit = 0xfffff,
394 	.ssd_type = SDT_MEMRWA,
395 	.ssd_dpl = SEL_UPL,
396 	.ssd_p = 1,
397 	.ssd_long = 0,
398 	.ssd_def32 = 1,
399 	.ssd_gran = 1		},
400 [GCODE_SEL] = { /* 4 Code Descriptor for kernel */
401 	.ssd_base = 0x0,
402 	.ssd_limit = 0xfffff,
403 	.ssd_type = SDT_MEMERA,
404 	.ssd_dpl = SEL_KPL,
405 	.ssd_p = 1,
406 	.ssd_long = 1,
407 	.ssd_def32 = 0,
408 	.ssd_gran = 1		},
409 [GDATA_SEL] = { /* 5 Data Descriptor for kernel */
410 	.ssd_base = 0x0,
411 	.ssd_limit = 0xfffff,
412 	.ssd_type = SDT_MEMRWA,
413 	.ssd_dpl = SEL_KPL,
414 	.ssd_p = 1,
415 	.ssd_long = 1,
416 	.ssd_def32 = 0,
417 	.ssd_gran = 1		},
418 [GUCODE32_SEL] = { /* 6 32 bit Code Descriptor for user */
419 	.ssd_base = 0x0,
420 	.ssd_limit = 0xfffff,
421 	.ssd_type = SDT_MEMERA,
422 	.ssd_dpl = SEL_UPL,
423 	.ssd_p = 1,
424 	.ssd_long = 0,
425 	.ssd_def32 = 1,
426 	.ssd_gran = 1		},
427 [GUDATA_SEL] = { /* 7 32/64 bit Data Descriptor for user */
428 	.ssd_base = 0x0,
429 	.ssd_limit = 0xfffff,
430 	.ssd_type = SDT_MEMRWA,
431 	.ssd_dpl = SEL_UPL,
432 	.ssd_p = 1,
433 	.ssd_long = 0,
434 	.ssd_def32 = 1,
435 	.ssd_gran = 1		},
436 [GUCODE_SEL] = { /* 8 64 bit Code Descriptor for user */
437 	.ssd_base = 0x0,
438 	.ssd_limit = 0xfffff,
439 	.ssd_type = SDT_MEMERA,
440 	.ssd_dpl = SEL_UPL,
441 	.ssd_p = 1,
442 	.ssd_long = 1,
443 	.ssd_def32 = 0,
444 	.ssd_gran = 1		},
445 [GPROC0_SEL] = { /* 9 Proc 0 TSS Descriptor */
446 	.ssd_base = 0x0,
447 	.ssd_limit = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE - 1,
448 	.ssd_type = SDT_SYSTSS,
449 	.ssd_dpl = SEL_KPL,
450 	.ssd_p = 1,
451 	.ssd_long = 0,
452 	.ssd_def32 = 0,
453 	.ssd_gran = 0		},
454 [GPROC0_SEL + 1] = { /* 10 Proc 0 TSS descriptor, double size */
455 	.ssd_base = 0x0,
456 	.ssd_limit = 0x0,
457 	.ssd_type = 0,
458 	.ssd_dpl = 0,
459 	.ssd_p = 0,
460 	.ssd_long = 0,
461 	.ssd_def32 = 0,
462 	.ssd_gran = 0		},
463 [GUSERLDT_SEL] = { /* 11 LDT Descriptor */
464 	.ssd_base = 0x0,
465 	.ssd_limit = 0x0,
466 	.ssd_type = 0,
467 	.ssd_dpl = 0,
468 	.ssd_p = 0,
469 	.ssd_long = 0,
470 	.ssd_def32 = 0,
471 	.ssd_gran = 0		},
472 [GUSERLDT_SEL + 1] = { /* 12 LDT Descriptor, double size */
473 	.ssd_base = 0x0,
474 	.ssd_limit = 0x0,
475 	.ssd_type = 0,
476 	.ssd_dpl = 0,
477 	.ssd_p = 0,
478 	.ssd_long = 0,
479 	.ssd_def32 = 0,
480 	.ssd_gran = 0		},
481 };
482 _Static_assert(nitems(gdt_segs) == NGDT, "Stale NGDT");
483 
484 void
485 setidt(int idx, inthand_t *func, int typ, int dpl, int ist)
486 {
487 	struct gate_descriptor *ip;
488 
489 	ip = idt + idx;
490 	ip->gd_looffset = (uintptr_t)func;
491 	ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
492 	ip->gd_ist = ist;
493 	ip->gd_xx = 0;
494 	ip->gd_type = typ;
495 	ip->gd_dpl = dpl;
496 	ip->gd_p = 1;
497 	ip->gd_hioffset = ((uintptr_t)func) >> 16;
498 }
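
/*
 * The 64-bit handler offset is split across gd_looffset (bits 15:0) and
 * gd_hioffset (bits 63:16).  A non-zero 'ist' selects one of the seven
 * Interrupt Stack Table slots in the TSS, forcing a stack switch on
 * delivery; 0 keeps the current kernel stack.  For example, hammer_time()
 * below routes NMIs through IST slot 2:
 *
 *	setidt(IDT_NMI, &IDTVEC(nmi), SDT_SYSIGT, SEL_KPL, 2);
 */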
499 
500 extern inthand_t
501 	IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
502 	IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
503 	IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
504 	IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
505 	IDTVEC(xmm), IDTVEC(dblfault),
506 	IDTVEC(div_pti), IDTVEC(bpt_pti),
507 	IDTVEC(ofl_pti), IDTVEC(bnd_pti), IDTVEC(ill_pti), IDTVEC(dna_pti),
508 	IDTVEC(fpusegm_pti), IDTVEC(tss_pti), IDTVEC(missing_pti),
509 	IDTVEC(stk_pti), IDTVEC(prot_pti), IDTVEC(page_pti),
510 	IDTVEC(rsvd_pti), IDTVEC(fpu_pti), IDTVEC(align_pti),
511 	IDTVEC(xmm_pti),
512 #ifdef KDTRACE_HOOKS
513 	IDTVEC(dtrace_ret), IDTVEC(dtrace_ret_pti),
514 #endif
515 #ifdef XENHVM
516 	IDTVEC(xen_intr_upcall), IDTVEC(xen_intr_upcall_pti),
517 #endif
518 	IDTVEC(fast_syscall), IDTVEC(fast_syscall32),
519 	IDTVEC(fast_syscall_pti);
520 
521 #ifdef DDB
522 /*
523  * Display the index and function name of any IDT entries that don't use
524  * the default 'rsvd' entry point.
525  */
526 DB_SHOW_COMMAND_FLAGS(idt, db_show_idt, DB_CMD_MEMSAFE)
527 {
528 	struct gate_descriptor *ip;
529 	int idx;
530 	uintptr_t func;
531 
532 	ip = idt;
533 	for (idx = 0; idx < NIDT && !db_pager_quit; idx++) {
534 		func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset);
535 		if (func != (uintptr_t)&IDTVEC(rsvd)) {
536 			db_printf("%3d\t", idx);
537 			db_printsym(func, DB_STGY_PROC);
538 			db_printf("\n");
539 		}
540 		ip++;
541 	}
542 }
543 
544 /* Show privileged registers. */
545 DB_SHOW_COMMAND_FLAGS(sysregs, db_show_sysregs, DB_CMD_MEMSAFE)
546 {
547 	struct {
548 		uint16_t limit;
549 		uint64_t base;
550 	} __packed idtr, gdtr;
551 	uint16_t ldt, tr;
552 
553 	__asm __volatile("sidt %0" : "=m" (idtr));
554 	db_printf("idtr\t0x%016lx/%04x\n",
555 	    (u_long)idtr.base, (u_int)idtr.limit);
556 	__asm __volatile("sgdt %0" : "=m" (gdtr));
557 	db_printf("gdtr\t0x%016lx/%04x\n",
558 	    (u_long)gdtr.base, (u_int)gdtr.limit);
559 	__asm __volatile("sldt %0" : "=r" (ldt));
560 	db_printf("ldtr\t0x%04x\n", ldt);
561 	__asm __volatile("str %0" : "=r" (tr));
562 	db_printf("tr\t0x%04x\n", tr);
563 	db_printf("cr0\t0x%016lx\n", rcr0());
564 	db_printf("cr2\t0x%016lx\n", rcr2());
565 	db_printf("cr3\t0x%016lx\n", rcr3());
566 	db_printf("cr4\t0x%016lx\n", rcr4());
567 	if (rcr4() & CR4_XSAVE)
568 		db_printf("xcr0\t0x%016lx\n", rxcr(0));
569 	db_printf("EFER\t0x%016lx\n", rdmsr(MSR_EFER));
570 	if (cpu_feature2 & (CPUID2_VMX | CPUID2_SMX))
571 		db_printf("FEATURES_CTL\t%016lx\n",
572 		    rdmsr(MSR_IA32_FEATURE_CONTROL));
573 	db_printf("DEBUG_CTL\t0x%016lx\n", rdmsr(MSR_DEBUGCTLMSR));
574 	db_printf("PAT\t0x%016lx\n", rdmsr(MSR_PAT));
575 	db_printf("GSBASE\t0x%016lx\n", rdmsr(MSR_GSBASE));
576 }
577 
578 DB_SHOW_COMMAND_FLAGS(dbregs, db_show_dbregs, DB_CMD_MEMSAFE)
579 {
580 
581 	db_printf("dr0\t0x%016lx\n", rdr0());
582 	db_printf("dr1\t0x%016lx\n", rdr1());
583 	db_printf("dr2\t0x%016lx\n", rdr2());
584 	db_printf("dr3\t0x%016lx\n", rdr3());
585 	db_printf("dr6\t0x%016lx\n", rdr6());
586 	db_printf("dr7\t0x%016lx\n", rdr7());
587 }
588 #endif
589 
590 void
591 sdtossd(struct user_segment_descriptor *sd, struct soft_segment_descriptor *ssd)
592 {
593 
594 	ssd->ssd_base  = (sd->sd_hibase << 24) | sd->sd_lobase;
595 	ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
596 	ssd->ssd_type  = sd->sd_type;
597 	ssd->ssd_dpl   = sd->sd_dpl;
598 	ssd->ssd_p     = sd->sd_p;
599 	ssd->ssd_long  = sd->sd_long;
600 	ssd->ssd_def32 = sd->sd_def32;
601 	ssd->ssd_gran  = sd->sd_gran;
602 }
603 
604 void
605 ssdtosd(struct soft_segment_descriptor *ssd, struct user_segment_descriptor *sd)
606 {
607 
608 	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
609 	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff;
610 	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
611 	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
612 	sd->sd_type  = ssd->ssd_type;
613 	sd->sd_dpl   = ssd->ssd_dpl;
614 	sd->sd_p     = ssd->ssd_p;
615 	sd->sd_long  = ssd->ssd_long;
616 	sd->sd_def32 = ssd->ssd_def32;
617 	sd->sd_gran  = ssd->ssd_gran;
618 }
619 
620 void
621 ssdtosyssd(struct soft_segment_descriptor *ssd, struct system_segment_descriptor *sd)
622 {
623 
624 	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
625 	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful;
626 	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
627 	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
628 	sd->sd_type  = ssd->ssd_type;
629 	sd->sd_dpl   = ssd->ssd_dpl;
630 	sd->sd_p     = ssd->ssd_p;
631 	sd->sd_gran  = ssd->ssd_gran;
632 }
633 
634 u_int basemem;
635 
636 static int
637 add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap,
638     int *physmap_idxp)
639 {
640 	int i, insert_idx, physmap_idx;
641 
642 	physmap_idx = *physmap_idxp;
643 
644 	if (length == 0)
645 		return (1);
646 
647 	/*
648 	 * Find insertion point while checking for overlap.  Start off by
649 	 * assuming the new entry will be added to the end.
650 	 *
651 	 * NB: physmap_idx points to the next free slot.
652 	 */
653 	insert_idx = physmap_idx;
654 	for (i = 0; i < physmap_idx; i += 2) {
655 		if (base < physmap[i + 1]) {
656 			if (base + length <= physmap[i]) {
657 				insert_idx = i;
658 				break;
659 			}
660 			if (boothowto & RB_VERBOSE)
661 				printf(
662 		    "Overlapping memory regions, ignoring second region\n");
663 			return (1);
664 		}
665 	}
666 
667 	/* See if we can prepend to the next entry. */
668 	if (insert_idx < physmap_idx && base + length == physmap[insert_idx]) {
669 		physmap[insert_idx] = base;
670 		return (1);
671 	}
672 
673 	/* See if we can append to the previous entry. */
674 	if (insert_idx > 0 && base == physmap[insert_idx - 1]) {
675 		physmap[insert_idx - 1] += length;
676 		return (1);
677 	}
678 
679 	if (physmap_idx == PHYS_AVAIL_ENTRIES) {
680 		printf(
681 		"Too many segments in the physical address map, giving up\n");
682 		return (0);
683 	}
684 
685 	/*
686 	 * Move the last 'N' entries down to make room for the new
687 	 * entry if needed.
688 	 */
689 	for (i = physmap_idx; i > insert_idx; i -= 2) {
690 		physmap[i] = physmap[i - 2];
691 		physmap[i + 1] = physmap[i - 1];
692 	}
693 
694 	physmap_idx += 2;
695 	*physmap_idxp = physmap_idx;
696 
697 	/* Insert the new entry. */
698 	physmap[insert_idx] = base;
699 	physmap[insert_idx + 1] = base + length;
700 	return (1);
701 }
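
/*
 * Worked example with a hypothetical map: starting from
 * physmap = { 0x0, 0xa0000 }, adding base=0xa0000/length=0x60000 coalesces
 * with the previous entry, giving { 0x0, 0x100000 }, while adding
 * base=0x200000/length=0x100000 appends a new { 0x200000, 0x300000 } pair.
 * A region overlapping an existing entry is dropped (with a warning under
 * RB_VERBOSE) rather than merged.
 */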
702 
703 void
704 bios_add_smap_entries(struct bios_smap *smapbase, u_int32_t smapsize,
705                       vm_paddr_t *physmap, int *physmap_idx)
706 {
707 	struct bios_smap *smap, *smapend;
708 
709 	smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);
710 
711 	for (smap = smapbase; smap < smapend; smap++) {
712 		if (boothowto & RB_VERBOSE)
713 			printf("SMAP type=%02x base=%016lx len=%016lx\n",
714 			    smap->type, smap->base, smap->length);
715 
716 		if (smap->type != SMAP_TYPE_MEMORY)
717 			continue;
718 
719 		if (!add_physmap_entry(smap->base, smap->length, physmap,
720 		    physmap_idx))
721 			break;
722 	}
723 }
724 
725 static void
726 add_efi_map_entries(struct efi_map_header *efihdr, vm_paddr_t *physmap,
727     int *physmap_idx)
728 {
729 	struct efi_md *map, *p;
730 	const char *type;
731 	size_t efisz;
732 	int ndesc, i;
733 
734 	static const char *types[] = {
735 		"Reserved",
736 		"LoaderCode",
737 		"LoaderData",
738 		"BootServicesCode",
739 		"BootServicesData",
740 		"RuntimeServicesCode",
741 		"RuntimeServicesData",
742 		"ConventionalMemory",
743 		"UnusableMemory",
744 		"ACPIReclaimMemory",
745 		"ACPIMemoryNVS",
746 		"MemoryMappedIO",
747 		"MemoryMappedIOPortSpace",
748 		"PalCode",
749 		"PersistentMemory"
750 	};
751 
752 	/*
753 	 * Memory map data provided by UEFI via the GetMemoryMap
754 	 * Boot Services API.
755 	 */
756 	efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf;
757 	map = (struct efi_md *)((uint8_t *)efihdr + efisz);
758 
759 	if (efihdr->descriptor_size == 0)
760 		return;
761 	ndesc = efihdr->memory_size / efihdr->descriptor_size;
762 
763 	if (boothowto & RB_VERBOSE)
764 		printf("%23s %12s %12s %8s %4s\n",
765 		    "Type", "Physical", "Virtual", "#Pages", "Attr");
766 
767 	TUNABLE_INT_FETCH("machdep.efirt.regs", &efi_map_regs);
768 	for (i = 0, p = map; i < ndesc; i++,
769 	    p = efi_next_descriptor(p, efihdr->descriptor_size)) {
770 		if (boothowto & RB_VERBOSE) {
771 			if (p->md_type < nitems(types))
772 				type = types[p->md_type];
773 			else
774 				type = "<INVALID>";
775 			printf("%23s %012lx %012lx %08lx ", type, p->md_phys,
776 			    p->md_virt, p->md_pages);
777 			if (p->md_attr & EFI_MD_ATTR_UC)
778 				printf("UC ");
779 			if (p->md_attr & EFI_MD_ATTR_WC)
780 				printf("WC ");
781 			if (p->md_attr & EFI_MD_ATTR_WT)
782 				printf("WT ");
783 			if (p->md_attr & EFI_MD_ATTR_WB)
784 				printf("WB ");
785 			if (p->md_attr & EFI_MD_ATTR_UCE)
786 				printf("UCE ");
787 			if (p->md_attr & EFI_MD_ATTR_WP)
788 				printf("WP ");
789 			if (p->md_attr & EFI_MD_ATTR_RP)
790 				printf("RP ");
791 			if (p->md_attr & EFI_MD_ATTR_XP)
792 				printf("XP ");
793 			if (p->md_attr & EFI_MD_ATTR_NV)
794 				printf("NV ");
795 			if (p->md_attr & EFI_MD_ATTR_MORE_RELIABLE)
796 				printf("MORE_RELIABLE ");
797 			if (p->md_attr & EFI_MD_ATTR_RO)
798 				printf("RO ");
799 			if (p->md_attr & EFI_MD_ATTR_RT)
800 				printf("RUNTIME");
801 			printf("\n");
802 		}
803 
804 		switch (p->md_type) {
805 		case EFI_MD_TYPE_BS_CODE:
806 		case EFI_MD_TYPE_BS_DATA:
807 			if (EFI_MAP_BOOTTYPE_ALLOWED(p->md_type))
808 				continue;
809 			/* FALLTHROUGH */
810 		case EFI_MD_TYPE_CODE:
811 		case EFI_MD_TYPE_DATA:
812 		case EFI_MD_TYPE_FREE:
813 			/*
814 			 * We're allowed to use any entry with these types.
815 			 */
816 			break;
817 		default:
818 			continue;
819 		}
820 
821 		if (!add_physmap_entry(p->md_phys, p->md_pages * EFI_PAGE_SIZE,
822 		    physmap, physmap_idx))
823 			break;
824 	}
825 }
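
/*
 * EFI memory descriptors always count EFI_PAGE_SIZE (4K) pages regardless
 * of the kernel page size, hence the explicit multiplication above rather
 * than ptoa().
 */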
826 
827 static void
828 native_parse_memmap(vm_paddr_t *physmap, int *physmap_idx)
829 {
830 	struct bios_smap *smap;
831 	struct efi_map_header *efihdr;
832 	u_int32_t size;
833 
834 	/*
835 	 * Memory map from INT 15:E820.
836 	 *
837 	 * subr_module.c says:
838 	 * "Consumer may safely assume that size value precedes data."
839 	 * ie: an int32_t immediately precedes smap.
840 	 */
841 
842 	efihdr = (struct efi_map_header *)preload_search_info(preload_kmdp,
843 	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
844 	smap = (struct bios_smap *)preload_search_info(preload_kmdp,
845 	    MODINFO_METADATA | MODINFOMD_SMAP);
846 	if (efihdr == NULL && smap == NULL)
847 		panic("No BIOS smap or EFI map info from loader!");
848 
849 	if (efihdr != NULL) {
850 		add_efi_map_entries(efihdr, physmap, physmap_idx);
851 		strlcpy(bootmethod, "UEFI", sizeof(bootmethod));
852 	} else {
853 		size = *((u_int32_t *)smap - 1);
854 		bios_add_smap_entries(smap, size, physmap, physmap_idx);
855 		strlcpy(bootmethod, "BIOS", sizeof(bootmethod));
856 	}
857 }
858 
859 #define	PAGES_PER_GB	(1024 * 1024 * 1024 / PAGE_SIZE)
860 
861 /*
862  * Populate the (physmap) array with base/bound pairs describing the
863  * available physical memory in the system, then test this memory and
864  * build the phys_avail array describing the actually-available memory.
865  *
866  * Total memory size may be set by the kernel environment variable
867  * hw.physmem or the compile-time define MAXMEM.
868  *
869  * XXX first should be vm_paddr_t.
870  */
871 static void
872 getmemsize(u_int64_t first)
873 {
874 	int i, physmap_idx, pa_indx, da_indx;
875 	vm_paddr_t pa, physmap[PHYS_AVAIL_ENTRIES];
876 	u_long physmem_start, physmem_tunable, memtest;
877 	pt_entry_t *pte;
878 	quad_t dcons_addr, dcons_size;
879 	int page_counter;
880 
881 	TSENTER();
882 	/*
883 	 * Tell the physical memory allocator about pages used to store
884 	 * the kernel and preloaded data.  See kmem_bootstrap_free().
885 	 */
886 	vm_phys_early_add_seg((vm_paddr_t)kernphys, trunc_page(first));
887 
888 	bzero(physmap, sizeof(physmap));
889 	physmap_idx = 0;
890 
891 	init_ops.parse_memmap(physmap, &physmap_idx);
892 	physmap_idx -= 2;
893 
894 	/*
895 	 * Find the 'base memory' segment for SMP
896 	 */
897 	basemem = 0;
898 	for (i = 0; i <= physmap_idx; i += 2) {
899 		if (physmap[i] <= 0xA0000) {
900 			basemem = physmap[i + 1] / 1024;
901 			break;
902 		}
903 	}
904 	if (basemem == 0 || basemem > 640) {
905 		if (bootverbose)
906 			printf(
907 		"Memory map doesn't contain a basemem segment, faking it\n");
908 		basemem = 640;
909 	}
910 
911 	/*
912 	 * Maxmem isn't the "maximum memory"; it's one larger than the
913 	 * highest page of the physical address space.  It should be
914 	 * called something like "Maxphyspage".  We may adjust this
915 	 * based on ``hw.physmem'' and the results of the memory test.
916 	 */
917 	Maxmem = atop(physmap[physmap_idx + 1]);
918 
919 #ifdef MAXMEM
920 	Maxmem = MAXMEM / 4;
921 #endif
922 
923 	if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
924 		Maxmem = atop(physmem_tunable);
925 
926 	/*
927 	 * The boot memory test is disabled by default, as it takes a
928 	 * significant amount of time on large-memory systems, and is
929 	 * unfriendly to virtual machines as it unnecessarily touches all
930 	 * pages.
931 	 *
932 	 * A general name is used as the code may be extended to support
933 	 * additional tests beyond the current "page present" test.
934 	 */
935 	memtest = 0;
936 	TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest);
937 
938 	/*
939 	 * Don't allow MAXMEM or hw.physmem to extend the amount of memory
940 	 * in the system.
941 	 */
942 	if (Maxmem > atop(physmap[physmap_idx + 1]))
943 		Maxmem = atop(physmap[physmap_idx + 1]);
944 
945 	if (atop(physmap[physmap_idx + 1]) != Maxmem &&
946 	    (boothowto & RB_VERBOSE))
947 		printf("Physical memory use set to %ldK\n", Maxmem * 4);
948 
949 	/* call pmap initialization to make new kernel address space */
950 	pmap_bootstrap(&first);
951 
952 	/*
953 	 * Size up each available chunk of physical memory.
954 	 *
955 	 * XXX Some BIOSes corrupt low 64KB between suspend and resume.
956 	 * By default, mask off the first 16 pages unless we appear to be
957 	 * running in a VM.
958 	 */
959 	physmem_start = (vm_guest > VM_GUEST_NO ? 1 : 16) << PAGE_SHIFT;
960 	TUNABLE_ULONG_FETCH("hw.physmem.start", &physmem_start);
961 	if (physmap[0] < physmem_start) {
962 		if (physmem_start < PAGE_SIZE)
963 			physmap[0] = PAGE_SIZE;
964 		else if (physmem_start >= physmap[1])
965 			physmap[0] = round_page(physmap[1] - PAGE_SIZE);
966 		else
967 			physmap[0] = round_page(physmem_start);
968 	}
969 	pa_indx = 0;
970 	da_indx = 1;
971 	phys_avail[pa_indx++] = physmap[0];
972 	phys_avail[pa_indx] = physmap[0];
973 	dump_avail[da_indx] = physmap[0];
974 	pte = CMAP1;
975 
976 	/*
977 	 * Get dcons buffer address
978 	 */
979 	if (getenv_quad("dcons.addr", &dcons_addr) == 0 ||
980 	    getenv_quad("dcons.size", &dcons_size) == 0)
981 		dcons_addr = 0;
982 
983 	/*
984 	 * physmap is in bytes, so when converting to page boundaries,
985 	 * round up the start address and round down the end address.
986 	 */
987 	page_counter = 0;
988 	if (memtest != 0)
989 		printf("Testing system memory");
990 	for (i = 0; i <= physmap_idx; i += 2) {
991 		vm_paddr_t end;
992 
993 		end = ptoa((vm_paddr_t)Maxmem);
994 		if (physmap[i + 1] < end)
995 			end = trunc_page(physmap[i + 1]);
996 		for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
997 			int *ptr = (int *)CADDR1;
998 			int tmp;
999 			bool full, page_bad;
1000 
1001 			full = false;
1002 			/*
1003 			 * block out kernel memory as not available.
1004 			 */
1005 			if (pa >= (vm_paddr_t)kernphys && pa < first)
1006 				goto do_dump_avail;
1007 
1008 			/*
1009 			 * block out dcons buffer
1010 			 */
1011 			if (dcons_addr > 0
1012 			    && pa >= trunc_page(dcons_addr)
1013 			    && pa < dcons_addr + dcons_size)
1014 				goto do_dump_avail;
1015 
1016 			page_bad = false;
1017 			if (memtest == 0)
1018 				goto skip_memtest;
1019 
1020 			/*
1021 			 * Print a "." every GB to show we're making
1022 			 * progress.
1023 			 */
1024 			page_counter++;
1025 			if ((page_counter % PAGES_PER_GB) == 0)
1026 				printf(".");
1027 
1028 			/*
1029 			 * map page into kernel: valid, read/write,non-cacheable
1030 			 */
1031 			*pte = pa | PG_V | PG_RW | PG_NC_PWT | PG_NC_PCD;
1032 			invltlb();
1033 
1034 			tmp = *(int *)ptr;
1035 			/*
1036 			 * Test for alternating 1's and 0's
1037 			 */
1038 			*(volatile int *)ptr = 0xaaaaaaaa;
1039 			if (*(volatile int *)ptr != 0xaaaaaaaa)
1040 				page_bad = true;
1041 			/*
1042 			 * Test for alternating 0's and 1's
1043 			 */
1044 			*(volatile int *)ptr = 0x55555555;
1045 			if (*(volatile int *)ptr != 0x55555555)
1046 				page_bad = true;
1047 			/*
1048 			 * Test for all 1's
1049 			 */
1050 			*(volatile int *)ptr = 0xffffffff;
1051 			if (*(volatile int *)ptr != 0xffffffff)
1052 				page_bad = true;
1053 			/*
1054 			 * Test for all 0's
1055 			 */
1056 			*(volatile int *)ptr = 0x0;
1057 			if (*(volatile int *)ptr != 0x0)
1058 				page_bad = true;
1059 			/*
1060 			 * Restore original value.
1061 			 */
1062 			*(int *)ptr = tmp;
1063 
1064 skip_memtest:
1065 			/*
1066 			 * Adjust array of valid/good pages.
1067 			 */
1068 			if (page_bad)
1069 				continue;
1070 			/*
1071 			 * If this good page is a continuation of the
1072 			 * previous set of good pages, then just increase
1073 			 * the end pointer. Otherwise start a new chunk.
1074 			 * Note that "end" is exclusive (one past the
1075 			 * last byte), making the range >= start and < end.
1076 			 * If we're also doing a speculative memory
1077 			 * test and we're at or past the end, bump up Maxmem
1078 			 * so that we keep going. The first bad page
1079 			 * will terminate the loop.
1080 			 */
1081 			if (phys_avail[pa_indx] == pa) {
1082 				phys_avail[pa_indx] += PAGE_SIZE;
1083 			} else {
1084 				pa_indx++;
1085 				if (pa_indx == PHYS_AVAIL_ENTRIES) {
1086 					printf(
1087 		"Too many holes in the physical address space, giving up\n");
1088 					pa_indx--;
1089 					full = true;
1090 					goto do_dump_avail;
1091 				}
1092 				phys_avail[pa_indx++] = pa;	/* start */
1093 				phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */
1094 			}
1095 			physmem++;
1096 do_dump_avail:
1097 			if (dump_avail[da_indx] == pa) {
1098 				dump_avail[da_indx] += PAGE_SIZE;
1099 			} else {
1100 				da_indx++;
1101 				if (da_indx == PHYS_AVAIL_ENTRIES) {
1102 					da_indx--;
1103 					goto do_next;
1104 				}
1105 				dump_avail[da_indx++] = pa; /* start */
1106 				dump_avail[da_indx] = pa + PAGE_SIZE; /* end */
1107 			}
1108 do_next:
1109 			if (full)
1110 				break;
1111 		}
1112 	}
1113 	*pte = 0;
1114 	invltlb();
1115 	if (memtest != 0)
1116 		printf("\n");
1117 
1118 	/*
1119 	 * XXX
1120 	 * The last chunk must contain at least one page plus the message
1121 	 * buffer to avoid complicating other code (message buffer address
1122 	 * calculation, etc.).
1123 	 */
1124 	while (phys_avail[pa_indx - 1] + PAGE_SIZE +
1125 	    round_page(msgbufsize) >= phys_avail[pa_indx]) {
1126 		physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
1127 		phys_avail[pa_indx--] = 0;
1128 		phys_avail[pa_indx--] = 0;
1129 	}
1130 
1131 	Maxmem = atop(phys_avail[pa_indx]);
1132 
1133 	/* Trim off space for the message buffer. */
1134 	phys_avail[pa_indx] -= round_page(msgbufsize);
1135 
1136 	/* Map the message buffer. */
1137 	msgbufp = (struct msgbuf *)PHYS_TO_DMAP(phys_avail[pa_indx]);
1138 	TSEXIT();
1139 }
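
/*
 * On return from getmemsize(), phys_avail[] holds zero-terminated
 * { start, end } pairs of usable physical memory (end exclusive), and
 * dump_avail[] the larger set to include in kernel dumps.  On a
 * hypothetical machine it might look like:
 *
 *	phys_avail = { 0x10000, 0x9f000, 0x100000, 0x1f8000000, 0, 0 }
 */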
1140 
1141 static void
1142 native_parse_preload_data(u_int64_t modulep)
1143 {
1144 	char *envp;
1145 #ifdef DDB
1146 	vm_offset_t ksym_start;
1147 	vm_offset_t ksym_end;
1148 #endif
1149 
1150 	preload_metadata = (caddr_t)(uintptr_t)(modulep + KERNBASE);
1151 	preload_bootstrap_relocate(KERNBASE);
1152 	preload_initkmdp(true);
1153 	boothowto = MD_FETCH(preload_kmdp, MODINFOMD_HOWTO, int);
1154 	envp = MD_FETCH(preload_kmdp, MODINFOMD_ENVP, char *);
1155 	if (envp != NULL)
1156 		envp += KERNBASE;
1157 	init_static_kenv(envp, 0);
1158 #ifdef DDB
1159 	ksym_start = MD_FETCH(preload_kmdp, MODINFOMD_SSYM, uintptr_t);
1160 	ksym_end = MD_FETCH(preload_kmdp, MODINFOMD_ESYM, uintptr_t);
1161 	db_fetch_ksymtab(ksym_start, ksym_end, 0);
1162 #endif
1163 	efi_systbl_phys = MD_FETCH(preload_kmdp, MODINFOMD_FW_HANDLE,
1164 	    vm_paddr_t);
1165 }
1166 
1167 static void
1168 native_clock_source_init(void)
1169 {
1170 	i8254_init();
1171 }
1172 
1173 static void
1174 amd64_kdb_init(void)
1175 {
1176 	kdb_init();
1177 #ifdef KDB
1178 	if (boothowto & RB_KDB)
1179 		kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger");
1180 #endif
1181 }
1182 
1183 /* Set up the fast syscall stuff */
1184 void
1185 amd64_conf_fast_syscall(void)
1186 {
1187 	uint64_t msr;
1188 
1189 	msr = rdmsr(MSR_EFER) | EFER_SCE;
1190 	wrmsr(MSR_EFER, msr);
1191 	wrmsr(MSR_LSTAR, pti ? (u_int64_t)IDTVEC(fast_syscall_pti) :
1192 	    (u_int64_t)IDTVEC(fast_syscall));
1193 	wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
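	/*
	 * MSR_STAR layout: bits 47:32 hold the selector base that SYSCALL
	 * loads (CS = GCODE_SEL, SS = GCODE_SEL + 8 = GDATA_SEL); bits
	 * 63:48 hold the base that SYSRET uses (32-bit CS = GUCODE32_SEL,
	 * SS = +8 = GUDATA_SEL, 64-bit CS = +16 = GUCODE_SEL).  This is
	 * why the user segments must stay in that exact order in the GDT.
	 */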
1194 	msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
1195 	    ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
1196 	wrmsr(MSR_STAR, msr);
1197 	wrmsr(MSR_SF_MASK, PSL_NT | PSL_T | PSL_I | PSL_C | PSL_D | PSL_AC);
1198 }
1199 
1200 void
1201 amd64_bsp_pcpu_init1(struct pcpu *pc)
1202 {
1203 	struct user_segment_descriptor *gdt;
1204 
1205 	PCPU_SET(prvspace, pc);
1206 	gdt = *PCPU_PTR(gdt);
1207 	PCPU_SET(curthread, &thread0);
1208 	PCPU_SET(tssp, PCPU_PTR(common_tss));
1209 	PCPU_SET(tss, (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
1210 	PCPU_SET(ldt, (struct system_segment_descriptor *)&gdt[GUSERLDT_SEL]);
1211 	PCPU_SET(fs32p, &gdt[GUFS32_SEL]);
1212 	PCPU_SET(gs32p, &gdt[GUGS32_SEL]);
1213 	PCPU_SET(ucr3_load_mask, PMAP_UCR3_NOMASK);
1214 	PCPU_SET(smp_tlb_gen, 1);
1215 }
1216 
1217 void
1218 amd64_bsp_pcpu_init2(uint64_t rsp0)
1219 {
1220 
1221 	PCPU_SET(rsp0, rsp0);
1222 	PCPU_SET(pti_rsp0, ((vm_offset_t)PCPU_PTR(pti_stack) +
1223 	    PC_PTI_STACK_SZ * sizeof(uint64_t)) & ~0xful);
1224 	PCPU_SET(curpcb, thread0.td_pcb);
1225 }
1226 
1227 void
1228 amd64_bsp_ist_init(struct pcpu *pc)
1229 {
1230 	struct nmi_pcpu *np;
1231 	struct amd64tss *tssp;
1232 
1233 	tssp = &pc->pc_common_tss;
1234 
1235 	/* doublefault stack space, runs on ist1 */
1236 	np = ((struct nmi_pcpu *)&dblfault_stack[sizeof(dblfault_stack)]) - 1;
1237 	np->np_pcpu = (register_t)pc;
1238 	tssp->tss_ist1 = (long)np;
1239 
1240 	/*
1241 	 * NMI stack, runs on ist2.  The pcpu pointer is stored just
1242 	 * above the start of the ist2 stack.
1243 	 */
1244 	np = ((struct nmi_pcpu *)&nmi0_stack[sizeof(nmi0_stack)]) - 1;
1245 	np->np_pcpu = (register_t)pc;
1246 	tssp->tss_ist2 = (long)np;
1247 
1248 	/*
1249 	 * MC# stack, runs on ist3.  The pcpu pointer is stored just
1250 	 * above the start of the ist3 stack.
1251 	 */
1252 	np = ((struct nmi_pcpu *)&mce0_stack[sizeof(mce0_stack)]) - 1;
1253 	np->np_pcpu = (register_t)pc;
1254 	tssp->tss_ist3 = (long)np;
1255 
1256 	/*
1257 	 * DB# stack, runs on ist4.
1258 	 */
1259 	np = ((struct nmi_pcpu *)&dbg0_stack[sizeof(dbg0_stack)]) - 1;
1260 	np->np_pcpu = (register_t)pc;
1261 	tssp->tss_ist4 = (long)np;
1262 }
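
/*
 * The dedicated IST stacks above exist because #DF, NMI, MC# and DB# can
 * arrive while the kernel stack or %gs base is not yet trustworthy, e.g.
 * in the window around swapgs on kernel entry.  Stashing the pcpu pointer
 * in a struct nmi_pcpu at the top of each stack lets those handlers
 * recover their per-CPU data without trusting %gs.
 */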
1263 
1264 /*
1265  * Calculate the kernel load address by inspecting the page table created by the loader.
1266  * The assumptions:
1267  * - kernel is mapped at KERNBASE, backed by contiguous phys memory
1268  *   aligned at 2M, below 4G (the latter is important for AP startup)
1269  * - there is a 2M hole at KERNBASE (KERNSTART = KERNBASE + 2M)
1270  * - kernel is mapped with 2M superpages
1271  * - all participating memory, i.e. kernel, modules, metadata,
1272  *   page table is accessible by pre-created 1:1 mapping
1273  *   (right now loader creates 1:1 mapping for lower 4G, and all
1274  *   memory is from there)
1275  * - there is a usable memory block right after the end of the
1276  *   mapped kernel and all modules/metadata, pointed to by
1277  *   physfree, for early allocations
1278  */
1279 vm_paddr_t __nosanitizeaddress __nosanitizememory
1280 amd64_loadaddr(void)
1281 {
1282 	pml4_entry_t *pml4e;
1283 	pdp_entry_t *pdpe;
1284 	pd_entry_t *pde;
1285 	uint64_t cr3;
1286 
1287 	cr3 = rcr3();
1288 	pml4e = (pml4_entry_t *)cr3 + pmap_pml4e_index(KERNSTART);
1289 	pdpe = (pdp_entry_t *)(*pml4e & PG_FRAME) + pmap_pdpe_index(KERNSTART);
1290 	pde = (pd_entry_t *)(*pdpe & PG_FRAME) + pmap_pde_index(KERNSTART);
1291 	return (*pde & PG_FRAME);
1292 }
1293 
1294 u_int64_t
1295 hammer_time(u_int64_t modulep, u_int64_t physfree)
1296 {
1297 	int gsel_tss, x;
1298 	struct pcpu *pc;
1299 	uint64_t rsp0;
1300 	char *env;
1301 	struct user_segment_descriptor *gdt;
1302 	struct region_descriptor r_gdt;
1303 	size_t kstack0_sz;
1304 
1305 	TSRAW(&thread0, TS_ENTER, __func__, NULL);
1306 
1307 	kernphys = amd64_loadaddr();
1308 
1309 	physfree += kernphys;
1310 
1311 	/* Initializes preload_kmdp */
1312 	init_ops.parse_preload_data(modulep);
1313 
1314 	efi_boot = preload_search_info(preload_kmdp, MODINFO_METADATA |
1315 	    MODINFOMD_EFI_MAP) != NULL;
1316 
1317 	if (!efi_boot) {
1318 		/* Tell the bios to warmboot next time */
1319 		atomic_store_short((u_short *)0x472, 0x1234);
1320 	}
1321 
1322 	physfree += ucode_load_bsp(physfree - kernphys + KERNSTART);
1323 	physfree = roundup2(physfree, PAGE_SIZE);
1324 
1325 	identify_cpu1();
1326 	identify_hypervisor();
1327 	identify_hypervisor_smbios();
1328 	identify_cpu_fixup_bsp();
1329 	identify_cpu2();
1330 	initializecpucache();
1331 
1332 	/*
1333 	 * Check for pti, pcid, and invpcid before ifuncs are
1334 	 * resolved, to correctly select the implementation for
1335 	 * pmap_activate_sw_mode().
1336 	 */
1337 	pti = pti_get_default();
1338 	TUNABLE_INT_FETCH("vm.pmap.pti", &pti);
1339 	TUNABLE_INT_FETCH("vm.pmap.pcid_enabled", &pmap_pcid_enabled);
1340 	if ((cpu_feature2 & CPUID2_PCID) != 0 && pmap_pcid_enabled) {
1341 		invpcid_works = (cpu_stdext_feature &
1342 		    CPUID_STDEXT_INVPCID) != 0;
1343 	} else {
1344 		pmap_pcid_enabled = 0;
1345 	}
1346 
1347 	/*
1348 	 * Now we can do small core initialization, after the PCID
1349 	 * CPU features and user knobs are evaluated.
1350 	 */
1351 	TUNABLE_INT_FETCH("vm.pmap.pcid_invlpg_workaround",
1352 	    &pmap_pcid_invlpg_workaround_uena);
1353 	cpu_init_small_core();
1354 
1355 	if ((cpu_feature2 & CPUID2_XSAVE) != 0) {
1356 		use_xsave = 1;
1357 		TUNABLE_INT_FETCH("hw.use_xsave", &use_xsave);
1358 	}
1359 
1360 	link_elf_ireloc();
1361 
1362 	/*
1363 	 * This may be done better later if it gets more high-level
1364 	 * components in it.  If so, just link td->td_proc here.
1365 	 */
1366 	proc_linkup0(&proc0, &thread0);
1367 
1368 	/* Init basic tunables, hz etc */
1369 	init_param1();
1370 
1371 	thread0.td_kstack = physfree - kernphys + KERNSTART;
1372 	thread0.td_kstack_pages = kstack_pages;
1373 	kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE;
1374 	bzero((void *)thread0.td_kstack, kstack0_sz);
1375 	physfree += kstack0_sz;
1376 
1377 	/*
1378 	 * Initialize enough of thread0 for delayed invalidation to
1379 	 * work very early.  Rely on thread0.td_base_pri
1380 	 * zero-initialization, it is reset to PVM at proc0_init().
1381 	 */
1382 	pmap_thread_init_invl_gen(&thread0);
1383 
1384 	pc = &temp_bsp_pcpu;
1385 	pcpu_init(pc, 0, sizeof(struct pcpu));
1386 	gdt = &temp_bsp_pcpu.pc_gdt[0];
1387 
1388 	/*
1389 	 * make gdt memory segments
1390 	 */
1391 	for (x = 0; x < NGDT; x++) {
1392 		if (x != GPROC0_SEL && x != (GPROC0_SEL + 1) &&
1393 		    x != GUSERLDT_SEL && x != (GUSERLDT_SEL + 1))
1394 			ssdtosd(&gdt_segs[x], &gdt[x]);
1395 	}
1396 	gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&pc->pc_common_tss;
1397 	ssdtosyssd(&gdt_segs[GPROC0_SEL],
1398 	    (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
1399 
1400 	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
1401 	r_gdt.rd_base = (long)gdt;
1402 	lgdt(&r_gdt);
1403 
1404 	wrmsr(MSR_FSBASE, 0);		/* User value */
1405 	wrmsr(MSR_GSBASE, (u_int64_t)pc);
1406 	wrmsr(MSR_KGSBASE, 0);		/* User value while in the kernel */
1407 
1408 	dpcpu_init((void *)(physfree - kernphys + KERNSTART), 0);
1409 	physfree += DPCPU_SIZE;
1410 	amd64_bsp_pcpu_init1(pc);
1411 	/* Non-late cninit() and printf() can be moved up to here. */
1412 
1413 	/*
1414 	 * Initialize mutexes.
1415 	 *
1416 	 * icu_lock: in order to allow an interrupt to occur in a critical
1417 	 * 	     section, to set pcpu->ipending (etc...) properly, we
1418 	 *	     must be able to get the icu lock, so it can't be
1419 	 *	     under witness.
1420 	 */
1421 	mutex_init();
1422 	mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS);
1423 	mtx_init(&dt_lock, "descriptor tables", NULL, MTX_DEF);
1424 
1425 	/* exceptions */
1426 	for (x = 0; x < NIDT; x++)
1427 		setidt(x, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_SYSIGT,
1428 		    SEL_KPL, 0);
1429 	setidt(IDT_DE, pti ? &IDTVEC(div_pti) : &IDTVEC(div), SDT_SYSIGT,
1430 	    SEL_KPL, 0);
1431 	setidt(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 4);
1432 	setidt(IDT_NMI, &IDTVEC(nmi),  SDT_SYSIGT, SEL_KPL, 2);
1433 	setidt(IDT_BP, pti ? &IDTVEC(bpt_pti) : &IDTVEC(bpt), SDT_SYSIGT,
1434 	    SEL_UPL, 0);
1435 	setidt(IDT_OF, pti ? &IDTVEC(ofl_pti) : &IDTVEC(ofl), SDT_SYSIGT,
1436 	    SEL_UPL, 0);
1437 	setidt(IDT_BR, pti ? &IDTVEC(bnd_pti) : &IDTVEC(bnd), SDT_SYSIGT,
1438 	    SEL_KPL, 0);
1439 	setidt(IDT_UD, pti ? &IDTVEC(ill_pti) : &IDTVEC(ill), SDT_SYSIGT,
1440 	    SEL_KPL, 0);
1441 	setidt(IDT_NM, pti ? &IDTVEC(dna_pti) : &IDTVEC(dna), SDT_SYSIGT,
1442 	    SEL_KPL, 0);
1443 	setidt(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
1444 	setidt(IDT_FPUGP, pti ? &IDTVEC(fpusegm_pti) : &IDTVEC(fpusegm),
1445 	    SDT_SYSIGT, SEL_KPL, 0);
1446 	setidt(IDT_TS, pti ? &IDTVEC(tss_pti) : &IDTVEC(tss), SDT_SYSIGT,
1447 	    SEL_KPL, 0);
1448 	setidt(IDT_NP, pti ? &IDTVEC(missing_pti) : &IDTVEC(missing),
1449 	    SDT_SYSIGT, SEL_KPL, 0);
1450 	setidt(IDT_SS, pti ? &IDTVEC(stk_pti) : &IDTVEC(stk), SDT_SYSIGT,
1451 	    SEL_KPL, 0);
1452 	setidt(IDT_GP, pti ? &IDTVEC(prot_pti) : &IDTVEC(prot), SDT_SYSIGT,
1453 	    SEL_KPL, 0);
1454 	setidt(IDT_PF, pti ? &IDTVEC(page_pti) : &IDTVEC(page), SDT_SYSIGT,
1455 	    SEL_KPL, 0);
1456 	setidt(IDT_MF, pti ? &IDTVEC(fpu_pti) : &IDTVEC(fpu), SDT_SYSIGT,
1457 	    SEL_KPL, 0);
1458 	setidt(IDT_AC, pti ? &IDTVEC(align_pti) : &IDTVEC(align), SDT_SYSIGT,
1459 	    SEL_KPL, 0);
1460 	setidt(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 3);
1461 	setidt(IDT_XF, pti ? &IDTVEC(xmm_pti) : &IDTVEC(xmm), SDT_SYSIGT,
1462 	    SEL_KPL, 0);
1463 #ifdef KDTRACE_HOOKS
1464 	setidt(IDT_DTRACE_RET, pti ? &IDTVEC(dtrace_ret_pti) :
1465 	    &IDTVEC(dtrace_ret), SDT_SYSIGT, SEL_UPL, 0);
1466 #endif
1467 #ifdef XENHVM
1468 	setidt(IDT_EVTCHN, pti ? &IDTVEC(xen_intr_upcall_pti) :
1469 	    &IDTVEC(xen_intr_upcall), SDT_SYSIGT, SEL_KPL, 0);
1470 #endif
1471 	r_idt.rd_limit = sizeof(idt0) - 1;
1472 	r_idt.rd_base = (long) idt;
1473 	lidt(&r_idt);
1474 
1475 	TUNABLE_INT_FETCH("hw.ibrs_disable", &hw_ibrs_disable);
1476 	TUNABLE_INT_FETCH("machdep.mitigations.ibrs.disable", &hw_ibrs_disable);
1477 
1478 	TUNABLE_INT_FETCH("hw.spec_store_bypass_disable", &hw_ssb_disable);
1479 	TUNABLE_INT_FETCH("machdep.mitigations.ssb.disable", &hw_ssb_disable);
1480 
1481 	TUNABLE_INT_FETCH("machdep.syscall_ret_flush_l1d",
1482 	    &syscall_ret_l1d_flush_mode);
1483 
1484 	TUNABLE_INT_FETCH("hw.mds_disable", &hw_mds_disable);
1485 	TUNABLE_INT_FETCH("machdep.mitigations.mds.disable", &hw_mds_disable);
1486 
1487 	TUNABLE_INT_FETCH("machdep.mitigations.taa.enable", &x86_taa_enable);
1488 
1489 	TUNABLE_INT_FETCH("machdep.mitigations.rngds.enable",
1490 	    &x86_rngds_mitg_enable);
1491 
1492 	TUNABLE_INT_FETCH("machdep.mitigations.zenbleed.enable",
1493 	    &zenbleed_enable);
1494 	zenbleed_sanitize_enable();
1495 
1496 	finishidentcpu();	/* Final stage of CPU initialization */
1497 
1498 	invlpgb_works = (amd_extended_feature_extensions &
1499 	    AMDFEID_INVLPGB) != 0;
1500 	TUNABLE_INT_FETCH("vm.pmap.invlpgb_works", &invlpgb_works);
1501 	if (invlpgb_works)
1502 		invlpgb_maxcnt = cpu_procinfo3 & AMDID_INVLPGB_MAXCNT;
1503 
1504 	/*
1505 	 * Initialize the clock before the console so that console
1506 	 * initialization can use DELAY().
1507 	 */
1508 	clock_init();
1509 
1510 	initializecpu();	/* Initialize CPU registers */
1511 
1512 	amd64_bsp_ist_init(pc);
1513 
1514 	/* Set the IO permission bitmap (empty due to tss seg limit) */
1515 	pc->pc_common_tss.tss_iobase = sizeof(struct amd64tss) +
1516 	    IOPERM_BITMAP_SIZE;
1517 
1518 	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
1519 	ltr(gsel_tss);
1520 
1521 	amd64_conf_fast_syscall();
1522 
1523 	/*
1524 	 * We initialize the PCB pointer early so that exception
1525 	 * handlers will work.  Also set up td_critnest to short-cut
1526 	 * the page fault handler.
1527 	 */
1528 	cpu_max_ext_state_size = sizeof(struct savefpu);
1529 	set_top_of_stack_td(&thread0);
1530 	thread0.td_pcb = get_pcb_td(&thread0);
1531 	thread0.td_critnest = 1;
1532 
1533 	/*
1534 	 * The console and kdb should be initialized even earlier than here,
1535 	 * but some console drivers don't work until after getmemsize().
1536 	 * Default to late console initialization to support these drivers.
1537 	 * This loses mainly printf()s in getmemsize() and early debugging.
1538 	 */
1539 	TUNABLE_INT_FETCH("debug.late_console", &late_console);
1540 	if (!late_console) {
1541 		cninit();
1542 		amd64_kdb_init();
1543 	}
1544 
1545 	getmemsize(physfree);
1546 	init_param2(physmem);
1547 
1548 	/* now running on new page tables, configured, and u/iom is accessible */
1549 
1550 #ifdef DEV_PCI
1551 	/* This call might adjust phys_avail[]. */
1552 	pci_early_quirks();
1553 #endif
1554 
1555 	if (late_console)
1556 		cninit();
1557 
1558 	/*
1559 	 * Dump the boot metadata. We have to wait for cninit() since console
1560 	 * output is required. If it's grossly incorrect the kernel will never
1561 	 * make it this far.
1562 	 */
1563 	if (getenv_is_true("debug.dump_modinfo_at_boot"))
1564 		preload_dump();
1565 
1566 #ifdef DEV_ISA
1567 #ifdef DEV_ATPIC
1568 	elcr_probe();
1569 	atpic_startup();
1570 #else
1571 	/* Reset and mask the atpics and leave them shut down. */
1572 	atpic_reset();
1573 
1574 	/*
1575 	 * Point the ICU spurious interrupt vectors at the APIC spurious
1576 	 * interrupt handler.
1577 	 */
1578 	setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
1579 	setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
1580 #endif
1581 #else
1582 #error "have you forgotten the isa device?"
1583 #endif
1584 
1585 	if (late_console)
1586 		amd64_kdb_init();
1587 
1588 	msgbufinit(msgbufp, msgbufsize);
1589 	fpuinit();
1590 
1591 	/* Make an initial tss so the cpu can get an interrupt stack on syscall! */
1592 	rsp0 = thread0.td_md.md_stack_base;
1593 	/* Ensure the stack is aligned to 16 bytes */
1594 	rsp0 &= ~0xFul;
1595 	PCPU_PTR(common_tss)->tss_rsp0 = rsp0;
1596 	amd64_bsp_pcpu_init2(rsp0);
1597 
1598 	/* transfer to user mode */
1599 
1600 	_ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
1601 	_udatasel = GSEL(GUDATA_SEL, SEL_UPL);
1602 	_ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL);
1603 	_ufssel = GSEL(GUFS32_SEL, SEL_UPL);
1604 	_ugssel = GSEL(GUGS32_SEL, SEL_UPL);
1605 
1606 	load_ds(_udatasel);
1607 	load_es(_udatasel);
1608 	load_fs(_ufssel);
1609 
1610 	/* setup proc 0's pcb */
1611 	thread0.td_pcb->pcb_flags = 0;
1612 
1613 	env = kern_getenv("kernelname");
1614 	if (env != NULL)
1615 		strlcpy(kernelname, env, sizeof(kernelname));
1616 
1617 	kcsan_cpu_init(0);
1618 
1619 #ifdef FDT
1620 	x86_init_fdt();
1621 #endif
1622 	thread0.td_critnest = 0;
1623 
1624 	kasan_init();
1625 	kmsan_init();
1626 
1627 	TSEXIT();
1628 
1629 	/* Location of kernel stack for locore */
1630 	return (thread0.td_md.md_stack_base);
1631 }
1632 
1633 void
1634 cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
1635 {
1636 
1637 	pcpu->pc_acpi_id = 0xffffffff;
1638 }
1639 
1640 static int
1641 smap_sysctl_handler(SYSCTL_HANDLER_ARGS)
1642 {
1643 	struct bios_smap *smapbase;
1644 	struct bios_smap_xattr smap;
1645 	uint32_t *smapattr;
1646 	int count, error, i;
1647 
1648 	/* Retrieve the system memory map from the loader. */
1649 	smapbase = (struct bios_smap *)preload_search_info(preload_kmdp,
1650 	    MODINFO_METADATA | MODINFOMD_SMAP);
1651 	if (smapbase == NULL)
1652 		return (0);
1653 	smapattr = (uint32_t *)preload_search_info(preload_kmdp,
1654 	    MODINFO_METADATA | MODINFOMD_SMAP_XATTR);
1655 	count = *((uint32_t *)smapbase - 1) / sizeof(*smapbase);
1656 	error = 0;
1657 	for (i = 0; i < count; i++) {
1658 		smap.base = smapbase[i].base;
1659 		smap.length = smapbase[i].length;
1660 		smap.type = smapbase[i].type;
1661 		if (smapattr != NULL)
1662 			smap.xattr = smapattr[i];
1663 		else
1664 			smap.xattr = 0;
1665 		error = SYSCTL_OUT(req, &smap, sizeof(smap));
1666 	}
1667 	return (error);
1668 }
1669 SYSCTL_PROC(_machdep, OID_AUTO, smap,
1670     CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
1671     smap_sysctl_handler, "S,bios_smap_xattr",
1672     "Raw BIOS SMAP data");
1673 
1674 static int
1675 efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS)
1676 {
1677 	struct efi_map_header *efihdr;
1678 	uint32_t efisize;
1679 
1680 	efihdr = (struct efi_map_header *)preload_search_info(preload_kmdp,
1681 	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
1682 	if (efihdr == NULL)
1683 		return (0);
1684 	efisize = *((uint32_t *)efihdr - 1);
1685 	return (SYSCTL_OUT(req, efihdr, efisize));
1686 }
1687 SYSCTL_PROC(_machdep, OID_AUTO, efi_map,
1688     CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
1689     efi_map_sysctl_handler, "S,efi_map_header",
1690     "Raw EFI Memory Map");
1691 
1692 static int
1693 efi_arch_sysctl_handler(SYSCTL_HANDLER_ARGS)
1694 {
1695 	char *arch;
1696 
1697 	arch = (char *)preload_search_info(preload_kmdp,
1698 	    MODINFO_METADATA | MODINFOMD_EFI_ARCH);
1699 	if (arch == NULL)
1700 		return (0);
1701 
1702 	return (SYSCTL_OUT_STR(req, arch));
1703 }
1704 SYSCTL_PROC(_machdep, OID_AUTO, efi_arch,
1705     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
1706     efi_arch_sysctl_handler, "A", "EFI Firmware Architecture");
1707 
1708 void
1709 spinlock_enter(void)
1710 {
1711 	struct thread *td;
1712 	register_t flags;
1713 
1714 	td = curthread;
1715 	if (td->td_md.md_spinlock_count == 0) {
1716 		flags = intr_disable();
1717 		td->td_md.md_spinlock_count = 1;
1718 		td->td_md.md_saved_flags = flags;
1719 		critical_enter();
1720 	} else
1721 		td->td_md.md_spinlock_count++;
1722 }
1723 
1724 void
1725 spinlock_exit(void)
1726 {
1727 	struct thread *td;
1728 	register_t flags;
1729 
1730 	td = curthread;
1731 	flags = td->td_md.md_saved_flags;
1732 	td->td_md.md_spinlock_count--;
1733 	if (td->td_md.md_spinlock_count == 0) {
1734 		critical_exit();
1735 		intr_restore(flags);
1736 	}
1737 }
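
/*
 * spinlock_enter()/spinlock_exit() nest: only the outermost enter disables
 * interrupts (saving the old flags), and only the matching outermost exit
 * restores them, so spin lock acquisitions may safely nest:
 *
 *	spinlock_enter();	interrupts off, preemption blocked
 *	  spinlock_enter();	count bumped, no other state change
 *	  spinlock_exit();
 *	spinlock_exit();	saved interrupt state restored
 */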
1738 
1739 /*
1740  * Construct a PCB from a trapframe. This is called from kdb_trap() where
1741  * we want to start a backtrace from the function that caused us to enter
1742  * the debugger. We have the context in the trapframe, but base the trace
1743  * on the PCB. The PCB doesn't have to be perfect, as long as it contains
1744  * enough for a backtrace.
1745  */
1746 void
1747 makectx(struct trapframe *tf, struct pcb *pcb)
1748 {
1749 
1750 	pcb->pcb_r12 = tf->tf_r12;
1751 	pcb->pcb_r13 = tf->tf_r13;
1752 	pcb->pcb_r14 = tf->tf_r14;
1753 	pcb->pcb_r15 = tf->tf_r15;
1754 	pcb->pcb_rbp = tf->tf_rbp;
1755 	pcb->pcb_rbx = tf->tf_rbx;
1756 	pcb->pcb_rip = tf->tf_rip;
1757 	pcb->pcb_rsp = tf->tf_rsp;
1758 }
1759 
1760 /*
1761  * pcb_flags is only modified by the current thread, or by other threads
1762  * when the current thread is stopped.  However, the current thread may
1763  * change it from interrupt context in cpu_switch(), or in the trap handler.
1764  * When we read-modify-write pcb_flags from C sources, the compiler may
1765  * generate code that is not atomic with respect to the interrupt handler.
1766  * If a trap or interrupt happens and a flag is modified from the handler,
1767  * it can be clobbered with the cached value later.  Therefore, we set
1768  * and clear flags with single-instruction functions, which do not race
1769  * with modification of the flags from trap or interrupt context, because
1770  * traps and interrupts are taken only on instruction boundaries.
1771  */
1772 void
1773 set_pcb_flags_raw(struct pcb *pcb, const u_int flags)
1774 {
1775 
1776 	__asm __volatile("orl %1,%0"
1777 	    : "=m" (pcb->pcb_flags) : "ir" (flags), "m" (pcb->pcb_flags)
1778 	    : "cc", "memory");
1779 
1780 }
1781 
1782 /*
1783  * Support for RDFSBASE, WRFSBASE and similar instructions for the %gs
1784  * base requires that the kernel save MSR_FSBASE and MSR_{K,}GSBASE into
1785  * the pcb if user space modified the bases.  We must save them on a
1786  * context switch or if the return to usermode happens through doreti.
1787  *
1788  * Tracking of both events is performed by the pcb flag PCB_FULL_IRET,
1789  * which has the consequence that the base MSRs must be saved each time
1790  * the PCB_FULL_IRET flag is set.  We disable interrupts to sync with
1791  * context switches.
1792  */
1793 static void
1794 set_pcb_flags_fsgsbase(struct pcb *pcb, const u_int flags)
1795 {
1796 	register_t r;
1797 
1798 	if (curpcb == pcb &&
1799 	    (flags & PCB_FULL_IRET) != 0 &&
1800 	    (pcb->pcb_flags & PCB_FULL_IRET) == 0) {
1801 		r = intr_disable();
1802 		if ((pcb->pcb_flags & PCB_FULL_IRET) == 0) {
1803 			pcb->pcb_fsbase = rdfsbase();
1804 			pcb->pcb_gsbase = rdmsr(MSR_KGSBASE);
1805 		}
1806 		set_pcb_flags_raw(pcb, flags);
1807 		intr_restore(r);
1808 	} else {
1809 		set_pcb_flags_raw(pcb, flags);
1810 	}
1811 }
1812 
1813 DEFINE_IFUNC(, void, set_pcb_flags, (struct pcb *, const u_int))
1814 {
1815 
1816 	return ((cpu_stdext_feature & CPUID_STDEXT_FSGSBASE) != 0 ?
1817 	    set_pcb_flags_fsgsbase : set_pcb_flags_raw);
1818 }
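
/*
 * Ifunc resolvers such as the one above run once during boot, from
 * link_elf_ireloc() in hammer_time() or from late_ifunc_resolve(), so the
 * FSGSBASE check is a one-time selection rather than a per-call branch.
 */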
1819 
1820 void
1821 clear_pcb_flags(struct pcb *pcb, const u_int flags)
1822 {
1823 
1824 	__asm __volatile("andl %1,%0"
1825 	    : "=m" (pcb->pcb_flags) : "ir" (~flags), "m" (pcb->pcb_flags)
1826 	    : "cc", "memory");
1827 }
1828 
1829 #ifdef KDB
1830 
1831 /*
1832  * Provide inb() and outb() as functions.  They are normally only available as
1833  * inline functions, thus cannot be called from the debugger.
1834  */
1835 
1836 /* silence compiler warnings */
1837 u_char inb_(u_short);
1838 void outb_(u_short, u_char);
1839 
1840 u_char
1841 inb_(u_short port)
1842 {
1843 	return inb(port);
1844 }
1845 
1846 void
1847 outb_(u_short port, u_char data)
1848 {
1849 	outb(port, data);
1850 }
1851 
1852 #endif /* KDB */
1853 
1854 #undef memset
1855 #undef memmove
1856 #undef memcpy
1857 
1858 void	*memset_std(void *buf, int c, size_t len);
1859 void	*memset_erms(void *buf, int c, size_t len);
1860 void    *memmove_std(void * _Nonnull dst, const void * _Nonnull src,
1861 	    size_t len);
1862 void    *memmove_erms(void * _Nonnull dst, const void * _Nonnull src,
1863 	    size_t len);
1864 void    *memcpy_std(void * _Nonnull dst, const void * _Nonnull src,
1865 	    size_t len);
1866 void    *memcpy_erms(void * _Nonnull dst, const void * _Nonnull src,
1867 	    size_t len);
1868 
1869 #ifdef KCSAN
1870 /*
1871  * These fail to build as ifuncs when used with KCSAN.
1872  */
1873 void *
1874 memset(void *buf, int c, size_t len)
1875 {
1876 
1877 	return (memset_std(buf, c, len));
1878 }
1879 
1880 void *
1881 memmove(void * _Nonnull dst, const void * _Nonnull src, size_t len)
1882 {
1883 
1884 	return (memmove_std(dst, src, len));
1885 }
1886 
1887 void *
1888 memcpy(void * _Nonnull dst, const void * _Nonnull src, size_t len)
1889 {
1890 
1891 	return (memcpy_std(dst, src, len));
1892 }
1893 #else
1894 DEFINE_IFUNC(, void *, memset, (void *, int, size_t))
1895 {
1896 
1897 	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
1898 	    memset_erms : memset_std);
1899 }
1900 
1901 DEFINE_IFUNC(, void *, memmove, (void * _Nonnull, const void * _Nonnull,
1902     size_t))
1903 {
1904 
1905 	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
1906 	    memmove_erms : memmove_std);
1907 }
1908 
1909 DEFINE_IFUNC(, void *, memcpy, (void * _Nonnull, const void * _Nonnull,size_t))
1910 {
1911 
1912 	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
1913 	    memcpy_erms : memcpy_std);
1914 }
1915 #endif
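
/*
 * The *_erms variants selected above rely on REP MOVSB/STOSB being fast,
 * which is what the CPUID_STDEXT_ERMS (Enhanced REP MOVSB/STOSB) feature
 * bit advertises; the *_std variants fall back to word-sized loops.
 */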
1916 
1917 void	pagezero_std(void *addr);
1918 void	pagezero_erms(void *addr);
1919 DEFINE_IFUNC(, void , pagezero, (void *))
1920 {
1921 
1922 	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
1923 	    pagezero_erms : pagezero_std);
1924 }
1925