xref: /freebsd/sys/amd64/amd64/machdep.c (revision 6fb848f2ff91337dbb26024ab41103ee2b036021)
/*-
 * SPDX-License-Identifier: BSD-4-Clause
 *
 * Copyright (c) 2003 Peter Wemm.
 * Copyright (c) 1992 Terrence R. Lambert.
 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * William Jolitz.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include "opt_atpic.h"
#include "opt_cpu.h"
#include "opt_ddb.h"
#include "opt_inet.h"
#include "opt_isa.h"
#include "opt_kstack_pages.h"
#include "opt_maxmem.h"
#include "opt_pci.h"
#include "opt_platform.h"
#include "opt_sched.h"

#include <sys/param.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/asan.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/bus.h>
#include <sys/callout.h>
#include <sys/cons.h>
#include <sys/cpu.h>
#include <sys/csan.h>
#include <sys/efi.h>
#include <sys/eventhandler.h>
#include <sys/exec.h>
#include <sys/imgact.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/linker.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/memrange.h>
#include <sys/msan.h>
#include <sys/msgbuf.h>
#include <sys/mutex.h>
#include <sys/pcpu.h>
#include <sys/ptrace.h>
#include <sys/reboot.h>
#include <sys/reg.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/signalvar.h>
#include <sys/smp.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/sysproto.h>
#include <sys/ucontext.h>
#include <sys/vmmeter.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_extern.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_pager.h>
#include <vm/vm_phys.h>
#include <vm/vm_dumpset.h>

#ifdef DDB
#ifndef KDB
#error KDB must be enabled in order for DDB to work!
#endif
#include <ddb/ddb.h>
#include <ddb/db_sym.h>
#endif

#include <net/netisr.h>

#include <dev/smbios/smbios.h>

#include <machine/clock.h>
#include <machine/cpu.h>
#include <machine/cputypes.h>
#include <machine/frame.h>
#include <machine/intr_machdep.h>
#include <x86/mca.h>
#include <machine/md_var.h>
#include <machine/metadata.h>
#include <machine/pc/bios.h>
#include <machine/pcb.h>
#include <machine/proc.h>
#include <machine/sigframe.h>
#include <machine/specialreg.h>
#include <machine/trap.h>
#include <machine/tss.h>
#include <x86/ucode.h>
#include <x86/ifunc.h>
#include <machine/smp.h>
#ifdef FDT
#include <x86/fdt.h>
#endif

#ifdef DEV_ATPIC
#include <x86/isa/icu.h>
#else
#include <x86/apicvar.h>
#endif

#include <isa/isareg.h>
#include <isa/rtc.h>
#include <x86/init.h>

#ifndef SMP
#error amd64 requires options SMP
#endif

/* Sanity check for __curthread() */
CTASSERT(offsetof(struct pcpu, pc_curthread) == 0);
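/*
 * __curthread() reads %gs:OFFSETOF_CURTHREAD with a single instruction,
 * and that offset is hard-wired to 0, hence the assertion above.
 */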

/*
 * The PTI trampoline stack needs enough space for a hardware trapframe and a
 * couple of scratch registers, as well as the trapframe left behind after an
 * iret fault.
 */
CTASSERT(PC_PTI_STACK_SZ * sizeof(register_t) >= 2 * sizeof(struct pti_frame) -
    offsetof(struct pti_frame, pti_rip));

extern u_int64_t hammer_time(u_int64_t, u_int64_t);

static void cpu_startup(void *);
SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL);

/* Probe 8254 PIT and TSC. */
static void native_clock_source_init(void);

/* Preload data parse function */
static void native_parse_preload_data(u_int64_t);

/* Native function to fetch and parse the e820 map */
static void native_parse_memmap(vm_paddr_t *, int *);

/* Default init_ops implementation. */
struct init_ops init_ops = {
	.parse_preload_data =		native_parse_preload_data,
	.early_clock_source_init =	native_clock_source_init,
	.early_delay =			i8254_delay,
	.parse_memmap =			native_parse_memmap,
};

/*
 * Physical address of the EFI System Table. Stashed from the metadata hints
 * passed into the kernel and used by the EFI code to call runtime services.
 */
vm_paddr_t efi_systbl_phys;

/*
 * Bitmap of extra EFI memory region types that should be preserved and mapped
 * during runtime services calls.
 */
uint32_t efi_map_regs;

/* Intel ICH registers */
#define ICH_PMBASE	0x400
#define ICH_SMI_EN	(ICH_PMBASE + 0x30)
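/*
 * Bit 3 of SMI_EN is LEGACY_USB_EN; cpu_startup() clears it on the
 * affected MacBook models below to stop legacy-USB SMI# generation.
 */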

int	_udatasel, _ucodesel, _ucode32sel, _ufssel, _ugssel;

int cold = 1;

long Maxmem = 0;
long realmem = 0;
int late_console = 1;

struct kva_md_info kmi;

struct region_descriptor r_idt;

struct pcpu *__pcpu;
struct pcpu temp_bsp_pcpu;

struct mtx icu_lock;

struct mem_range_softc mem_range_softc;

struct mtx dt_lock;	/* lock for GDT and LDT */

void (*vmm_suspend_p)(void);
void (*vmm_resume_p)(void);

bool efi_boot;

static void
cpu_startup(void *dummy)
{
	uintmax_t memsize;
	char *sysenv;

	/*
	 * On MacBooks, we need to prevent the legacy USB circuit from
	 * generating an SMI# because this can cause several problems,
	 * namely: incorrect CPU frequency detection and failure to
	 * start the APs.
	 * We do this by disabling a bit in the SMI_EN (SMI Control and
	 * Enable register) of the Intel ICH LPC Interface Bridge.
	 */
	sysenv = kern_getenv("smbios.system.product");
	if (sysenv != NULL) {
		if (strncmp(sysenv, "MacBook1,1", 10) == 0 ||
		    strncmp(sysenv, "MacBook3,1", 10) == 0 ||
		    strncmp(sysenv, "MacBook4,1", 10) == 0 ||
		    strncmp(sysenv, "MacBookPro1,1", 13) == 0 ||
		    strncmp(sysenv, "MacBookPro1,2", 13) == 0 ||
		    strncmp(sysenv, "MacBookPro3,1", 13) == 0 ||
		    strncmp(sysenv, "MacBookPro4,1", 13) == 0 ||
		    strncmp(sysenv, "Macmini1,1", 10) == 0) {
			if (bootverbose)
				printf("Disabling LEGACY_USB_EN bit on "
				    "Intel ICH.\n");
			outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8);
		}
		freeenv(sysenv);
	}

	/*
	 * Good {morning,afternoon,evening,night}.
	 */
	startrtclock();
	printcpuinfo();

	/*
	 * Display physical memory if SMBIOS reports a reasonable amount.
	 */
	memsize = 0;
	sysenv = kern_getenv("smbios.memory.enabled");
	if (sysenv != NULL) {
		memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10;
		freeenv(sysenv);
	}
	if (memsize < ptoa((uintmax_t)vm_free_count()))
		memsize = ptoa((uintmax_t)Maxmem);
	printf("real memory  = %ju (%ju MB)\n", memsize, memsize >> 20);
	realmem = atop(memsize);

	/*
	 * Display any holes after the first chunk of extended memory.
	 */
	if (bootverbose) {
		int indx;

		printf("Physical memory chunk(s):\n");
		for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
			vm_paddr_t size;

			size = phys_avail[indx + 1] - phys_avail[indx];
			printf(
			    "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n",
			    (uintmax_t)phys_avail[indx],
			    (uintmax_t)phys_avail[indx + 1] - 1,
			    (uintmax_t)size, (uintmax_t)size / PAGE_SIZE);
		}
	}

	vm_ksubmap_init(&kmi);

	printf("avail memory = %ju (%ju MB)\n",
	    ptoa((uintmax_t)vm_free_count()),
	    ptoa((uintmax_t)vm_free_count()) / 1048576);
#ifdef DEV_PCI
	if (bootverbose && intel_graphics_stolen_base != 0)
		printf("intel stolen mem: base %#jx size %ju MB\n",
		    (uintmax_t)intel_graphics_stolen_base,
		    (uintmax_t)intel_graphics_stolen_size / 1024 / 1024);
#endif

	/*
	 * Set up buffers, so they can be used to read disk labels.
	 */
	bufinit();
	vm_pager_bufferinit();

	cpu_setregs();
}

static void
late_ifunc_resolve(void *dummy __unused)
{
	link_elf_late_ireloc();
}
SYSINIT(late_ifunc_resolve, SI_SUB_CPU, SI_ORDER_ANY, late_ifunc_resolve, NULL);

void
cpu_setregs(void)
{
	register_t cr0;

	TSENTER();
	cr0 = rcr0();
	cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM;
	TSENTER2("load_cr0");
	load_cr0(cr0);
	TSEXIT2("load_cr0");
	TSEXIT();
}

/*
 * Initialize amd64 and configure to run kernel
 */

/*
 * Initialize segments & interrupt table
 */
static struct gate_descriptor idt0[NIDT];
struct gate_descriptor *idt = &idt0[0];	/* interrupt descriptor table */

static char dblfault_stack[DBLFAULT_STACK_SIZE] __aligned(16);
static char mce0_stack[MCE_STACK_SIZE] __aligned(16);
static char nmi0_stack[NMI_STACK_SIZE] __aligned(16);
static char dbg0_stack[DBG_STACK_SIZE] __aligned(16);
CTASSERT(sizeof(struct nmi_pcpu) == 16);

/*
 * Software prototypes -- in more palatable form.
 *
 * Keep GUFS32, GUGS32, GUCODE32 and GUDATA at the same
 * slots as the corresponding segments for the i386 kernel.
 */
struct soft_segment_descriptor gdt_segs[] = {
[GNULL_SEL] = { /* 0 Null Descriptor */
	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
[GNULL2_SEL] = { /* 1 Null Descriptor */
	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
[GUFS32_SEL] = { /* 2 32 bit %fs Descriptor for user */
	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
[GUGS32_SEL] = { /* 3 32 bit %gs Descriptor for user */
	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
[GCODE_SEL] = { /* 4 Code Descriptor for kernel */
	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_long = 1,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
[GDATA_SEL] = { /* 5 Data Descriptor for kernel */
	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_long = 1,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
[GUCODE32_SEL] = { /* 6 32 bit Code Descriptor for user */
	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
[GUDATA_SEL] = { /* 7 32/64 bit Data Descriptor for user */
	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
[GUCODE_SEL] = { /* 8 64 bit Code Descriptor for user */
	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 1,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
[GPROC0_SEL] = { /* 9 Proc 0 TSS Descriptor */
	.ssd_base = 0x0,
	.ssd_limit = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE - 1,
	.ssd_type = SDT_SYSTSS,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
[GPROC0_SEL + 1] = { /* 10 Proc 0 TSS descriptor, double size */
	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
[GUSERLDT_SEL] = { /* 11 LDT Descriptor */
	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
[GUSERLDT_SEL + 1] = { /* 12 LDT Descriptor, double size */
	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
};
_Static_assert(nitems(gdt_segs) == NGDT, "Stale NGDT");

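/*
 * Install a gate for vector 'idx' in the IDT: 'func' is the handler
 * entry point, 'typ' the gate type (SDT_SYSIGT here), 'dpl' the
 * privilege level required to raise the vector from software, and
 * 'ist' the Interrupt Stack Table slot (0 means no stack switch).
 */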
483 void
setidt(int idx,inthand_t * func,int typ,int dpl,int ist)484 setidt(int idx, inthand_t *func, int typ, int dpl, int ist)
485 {
486 	struct gate_descriptor *ip;
487 
488 	ip = idt + idx;
489 	ip->gd_looffset = (uintptr_t)func;
490 	ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
491 	ip->gd_ist = ist;
492 	ip->gd_xx = 0;
493 	ip->gd_type = typ;
494 	ip->gd_dpl = dpl;
495 	ip->gd_p = 1;
496 	ip->gd_hioffset = ((uintptr_t)func)>>16 ;
497 }
498 
499 extern inthand_t
500 	IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
501 	IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
502 	IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
503 	IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
504 	IDTVEC(xmm), IDTVEC(dblfault),
505 	IDTVEC(div_pti), IDTVEC(bpt_pti),
506 	IDTVEC(ofl_pti), IDTVEC(bnd_pti), IDTVEC(ill_pti), IDTVEC(dna_pti),
507 	IDTVEC(fpusegm_pti), IDTVEC(tss_pti), IDTVEC(missing_pti),
508 	IDTVEC(stk_pti), IDTVEC(prot_pti), IDTVEC(page_pti),
509 	IDTVEC(rsvd_pti), IDTVEC(fpu_pti), IDTVEC(align_pti),
510 	IDTVEC(xmm_pti),
511 #ifdef KDTRACE_HOOKS
512 	IDTVEC(dtrace_ret), IDTVEC(dtrace_ret_pti),
513 #endif
514 #ifdef XENHVM
515 	IDTVEC(xen_intr_upcall), IDTVEC(xen_intr_upcall_pti),
516 #endif
517 	IDTVEC(fast_syscall), IDTVEC(fast_syscall32),
518 	IDTVEC(fast_syscall_pti);
519 
520 #ifdef DDB
521 /*
522  * Display the index and function name of any IDT entries that don't use
523  * the default 'rsvd' entry point.
524  */
DB_SHOW_COMMAND_FLAGS(idt,db_show_idt,DB_CMD_MEMSAFE)525 DB_SHOW_COMMAND_FLAGS(idt, db_show_idt, DB_CMD_MEMSAFE)
526 {
527 	struct gate_descriptor *ip;
528 	int idx;
529 	uintptr_t func;
530 
531 	ip = idt;
532 	for (idx = 0; idx < NIDT && !db_pager_quit; idx++) {
533 		func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset);
534 		if (func != (uintptr_t)&IDTVEC(rsvd)) {
535 			db_printf("%3d\t", idx);
536 			db_printsym(func, DB_STGY_PROC);
537 			db_printf("\n");
538 		}
539 		ip++;
540 	}
541 }
542 
543 /* Show privileged registers. */
DB_SHOW_COMMAND_FLAGS(sysregs,db_show_sysregs,DB_CMD_MEMSAFE)544 DB_SHOW_COMMAND_FLAGS(sysregs, db_show_sysregs, DB_CMD_MEMSAFE)
545 {
546 	struct {
547 		uint16_t limit;
548 		uint64_t base;
549 	} __packed idtr, gdtr;
550 	uint16_t ldt, tr;
551 
552 	__asm __volatile("sidt %0" : "=m" (idtr));
553 	db_printf("idtr\t0x%016lx/%04x\n",
554 	    (u_long)idtr.base, (u_int)idtr.limit);
555 	__asm __volatile("sgdt %0" : "=m" (gdtr));
556 	db_printf("gdtr\t0x%016lx/%04x\n",
557 	    (u_long)gdtr.base, (u_int)gdtr.limit);
558 	__asm __volatile("sldt %0" : "=r" (ldt));
559 	db_printf("ldtr\t0x%04x\n", ldt);
560 	__asm __volatile("str %0" : "=r" (tr));
561 	db_printf("tr\t0x%04x\n", tr);
562 	db_printf("cr0\t0x%016lx\n", rcr0());
563 	db_printf("cr2\t0x%016lx\n", rcr2());
564 	db_printf("cr3\t0x%016lx\n", rcr3());
565 	db_printf("cr4\t0x%016lx\n", rcr4());
566 	if (rcr4() & CR4_XSAVE)
567 		db_printf("xcr0\t0x%016lx\n", rxcr(0));
568 	db_printf("EFER\t0x%016lx\n", rdmsr(MSR_EFER));
569 	if (cpu_feature2 & (CPUID2_VMX | CPUID2_SMX))
570 		db_printf("FEATURES_CTL\t%016lx\n",
571 		    rdmsr(MSR_IA32_FEATURE_CONTROL));
572 	db_printf("DEBUG_CTL\t0x%016lx\n", rdmsr(MSR_DEBUGCTLMSR));
573 	db_printf("PAT\t0x%016lx\n", rdmsr(MSR_PAT));
574 	db_printf("GSBASE\t0x%016lx\n", rdmsr(MSR_GSBASE));
575 }
576 
DB_SHOW_COMMAND_FLAGS(dbregs,db_show_dbregs,DB_CMD_MEMSAFE)577 DB_SHOW_COMMAND_FLAGS(dbregs, db_show_dbregs, DB_CMD_MEMSAFE)
578 {
579 
580 	db_printf("dr0\t0x%016lx\n", rdr0());
581 	db_printf("dr1\t0x%016lx\n", rdr1());
582 	db_printf("dr2\t0x%016lx\n", rdr2());
583 	db_printf("dr3\t0x%016lx\n", rdr3());
584 	db_printf("dr6\t0x%016lx\n", rdr6());
585 	db_printf("dr7\t0x%016lx\n", rdr7());
586 }
587 #endif
588 
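/*
 * Converters between the machine-independent "soft" segment descriptor
 * representation and the packed hardware formats: sdtossd() unpacks a
 * user segment descriptor, ssdtosd() packs one, and ssdtosyssd() packs
 * the 16-byte long-mode system descriptor used for the TSS and LDT.
 */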
void
sdtossd(struct user_segment_descriptor *sd, struct soft_segment_descriptor *ssd)
{

	ssd->ssd_base  = (sd->sd_hibase << 24) | sd->sd_lobase;
	ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
	ssd->ssd_type  = sd->sd_type;
	ssd->ssd_dpl   = sd->sd_dpl;
	ssd->ssd_p     = sd->sd_p;
	ssd->ssd_long  = sd->sd_long;
	ssd->ssd_def32 = sd->sd_def32;
	ssd->ssd_gran  = sd->sd_gran;
}

void
ssdtosd(struct soft_segment_descriptor *ssd, struct user_segment_descriptor *sd)
{

	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff;
	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
	sd->sd_type  = ssd->ssd_type;
	sd->sd_dpl   = ssd->ssd_dpl;
	sd->sd_p     = ssd->ssd_p;
	sd->sd_long  = ssd->ssd_long;
	sd->sd_def32 = ssd->ssd_def32;
	sd->sd_gran  = ssd->ssd_gran;
}

void
ssdtosyssd(struct soft_segment_descriptor *ssd, struct system_segment_descriptor *sd)
{

	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful;
	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
	sd->sd_type  = ssd->ssd_type;
	sd->sd_dpl   = ssd->ssd_dpl;
	sd->sd_p     = ssd->ssd_p;
	sd->sd_gran  = ssd->ssd_gran;
}

u_int basemem;

static int
add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap,
    int *physmap_idxp)
{
	int i, insert_idx, physmap_idx;

	physmap_idx = *physmap_idxp;

	if (length == 0)
		return (1);

	/*
	 * Find insertion point while checking for overlap.  Start off by
	 * assuming the new entry will be added to the end.
	 *
	 * NB: physmap_idx points to the next free slot.
	 */
	insert_idx = physmap_idx;
	for (i = 0; i < physmap_idx; i += 2) {
		if (base < physmap[i + 1]) {
			if (base + length <= physmap[i]) {
				insert_idx = i;
				break;
			}
			if (boothowto & RB_VERBOSE)
				printf(
		    "Overlapping memory regions, ignoring second region\n");
			return (1);
		}
	}

	/* See if we can prepend to the next entry. */
	if (insert_idx < physmap_idx && base + length == physmap[insert_idx]) {
		physmap[insert_idx] = base;
		return (1);
	}

	/* See if we can append to the previous entry. */
	if (insert_idx > 0 && base == physmap[insert_idx - 1]) {
		physmap[insert_idx - 1] += length;
		return (1);
	}

	if (physmap_idx == PHYS_AVAIL_ENTRIES) {
		printf(
		"Too many segments in the physical address map, giving up\n");
		return (0);
	}

	/*
	 * Move the last 'N' entries down to make room for the new
	 * entry if needed.
	 */
	for (i = physmap_idx; i > insert_idx; i -= 2) {
		physmap[i] = physmap[i - 2];
		physmap[i + 1] = physmap[i - 1];
	}

	physmap_idx += 2;
	*physmap_idxp = physmap_idx;

	/* Insert the new entry. */
	physmap[insert_idx] = base;
	physmap[insert_idx + 1] = base + length;
	return (1);
}

void
bios_add_smap_entries(struct bios_smap *smapbase, u_int32_t smapsize,
    vm_paddr_t *physmap, int *physmap_idx)
{
	struct bios_smap *smap, *smapend;

	smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);

	for (smap = smapbase; smap < smapend; smap++) {
		if (boothowto & RB_VERBOSE)
			printf("SMAP type=%02x base=%016lx len=%016lx\n",
			    smap->type, smap->base, smap->length);

		if (smap->type != SMAP_TYPE_MEMORY)
			continue;

		if (!add_physmap_entry(smap->base, smap->length, physmap,
		    physmap_idx))
			break;
	}
}

static void
add_efi_map_entries(struct efi_map_header *efihdr, vm_paddr_t *physmap,
    int *physmap_idx)
{
	struct efi_md *map, *p;
	const char *type;
	size_t efisz;
	int ndesc, i;

	static const char *types[] = {
		"Reserved",
		"LoaderCode",
		"LoaderData",
		"BootServicesCode",
		"BootServicesData",
		"RuntimeServicesCode",
		"RuntimeServicesData",
		"ConventionalMemory",
		"UnusableMemory",
		"ACPIReclaimMemory",
		"ACPIMemoryNVS",
		"MemoryMappedIO",
		"MemoryMappedIOPortSpace",
		"PalCode",
		"PersistentMemory"
	};

	/*
	 * Memory map data provided by UEFI via the GetMemoryMap
	 * Boot Services API.
	 */
	efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf;
	map = (struct efi_md *)((uint8_t *)efihdr + efisz);

	if (efihdr->descriptor_size == 0)
		return;
	ndesc = efihdr->memory_size / efihdr->descriptor_size;

	if (boothowto & RB_VERBOSE)
		printf("%23s %12s %12s %8s %4s\n",
		    "Type", "Physical", "Virtual", "#Pages", "Attr");

	TUNABLE_INT_FETCH("machdep.efirt.regs", &efi_map_regs);
	for (i = 0, p = map; i < ndesc; i++,
	    p = efi_next_descriptor(p, efihdr->descriptor_size)) {
		if (boothowto & RB_VERBOSE) {
			if (p->md_type < nitems(types))
				type = types[p->md_type];
			else
				type = "<INVALID>";
			printf("%23s %012lx %012lx %08lx ", type, p->md_phys,
			    p->md_virt, p->md_pages);
			if (p->md_attr & EFI_MD_ATTR_UC)
				printf("UC ");
			if (p->md_attr & EFI_MD_ATTR_WC)
				printf("WC ");
			if (p->md_attr & EFI_MD_ATTR_WT)
				printf("WT ");
			if (p->md_attr & EFI_MD_ATTR_WB)
				printf("WB ");
			if (p->md_attr & EFI_MD_ATTR_UCE)
				printf("UCE ");
			if (p->md_attr & EFI_MD_ATTR_WP)
				printf("WP ");
			if (p->md_attr & EFI_MD_ATTR_RP)
				printf("RP ");
			if (p->md_attr & EFI_MD_ATTR_XP)
				printf("XP ");
			if (p->md_attr & EFI_MD_ATTR_NV)
				printf("NV ");
			if (p->md_attr & EFI_MD_ATTR_MORE_RELIABLE)
				printf("MORE_RELIABLE ");
			if (p->md_attr & EFI_MD_ATTR_RO)
				printf("RO ");
			if (p->md_attr & EFI_MD_ATTR_RT)
				printf("RUNTIME");
			printf("\n");
		}

		switch (p->md_type) {
		case EFI_MD_TYPE_BS_CODE:
		case EFI_MD_TYPE_BS_DATA:
			if (EFI_MAP_BOOTTYPE_ALLOWED(p->md_type))
				continue;
			/* FALLTHROUGH */
		case EFI_MD_TYPE_CODE:
		case EFI_MD_TYPE_DATA:
		case EFI_MD_TYPE_FREE:
			/*
			 * We're allowed to use any entry with these types.
			 */
			break;
		default:
			continue;
		}

		if (!add_physmap_entry(p->md_phys, p->md_pages * EFI_PAGE_SIZE,
		    physmap, physmap_idx))
			break;
	}
}

static void
native_parse_memmap(vm_paddr_t *physmap, int *physmap_idx)
{
	struct bios_smap *smap;
	struct efi_map_header *efihdr;
	u_int32_t size;

	/*
	 * Memory map from INT 15:E820.
	 *
	 * subr_module.c says:
	 * "Consumer may safely assume that size value precedes data."
	 * ie: an int32_t immediately precedes smap.
	 */

	efihdr = (struct efi_map_header *)preload_search_info(preload_kmdp,
	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
	smap = (struct bios_smap *)preload_search_info(preload_kmdp,
	    MODINFO_METADATA | MODINFOMD_SMAP);
	if (efihdr == NULL && smap == NULL)
		panic("No BIOS smap or EFI map info from loader!");

	if (efihdr != NULL) {
		add_efi_map_entries(efihdr, physmap, physmap_idx);
		strlcpy(bootmethod, "UEFI", sizeof(bootmethod));
	} else {
		size = *((u_int32_t *)smap - 1);
		bios_add_smap_entries(smap, size, physmap, physmap_idx);
		strlcpy(bootmethod, "BIOS", sizeof(bootmethod));
	}
}

#define	PAGES_PER_GB	(1024 * 1024 * 1024 / PAGE_SIZE)

/*
 * Populate the (physmap) array with base/bound pairs describing the
 * available physical memory in the system, then test this memory and
 * build the phys_avail array describing the actually-available memory.
 *
 * Total memory size may be set by the kernel environment variable
 * hw.physmem or the compile-time define MAXMEM.
 *
 * XXX first should be vm_paddr_t.
 */
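/*
 * physmap[] holds {base, bound} byte addresses as flat pairs, where the
 * bound is one past the last byte; for example, a machine with RAM at
 * 0-640K and 1M-4G would produce physmap[0..3] =
 * { 0x0, 0xa0000, 0x100000, 0x100000000 }.
 */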
static void
getmemsize(u_int64_t first)
{
	int i, physmap_idx, pa_indx, da_indx;
	vm_paddr_t pa, physmap[PHYS_AVAIL_ENTRIES];
	u_long physmem_start, physmem_tunable, memtest;
	pt_entry_t *pte;
	quad_t dcons_addr, dcons_size;
	int page_counter;

	TSENTER();
	/*
	 * Tell the physical memory allocator about pages used to store
	 * the kernel and preloaded data.  See kmem_bootstrap_free().
	 */
	vm_phys_early_add_seg((vm_paddr_t)kernphys, trunc_page(first));

	bzero(physmap, sizeof(physmap));
	physmap_idx = 0;

	init_ops.parse_memmap(physmap, &physmap_idx);
	physmap_idx -= 2;

	/*
	 * Find the 'base memory' segment for SMP
	 */
	basemem = 0;
	for (i = 0; i <= physmap_idx; i += 2) {
		if (physmap[i] <= 0xA0000) {
			basemem = physmap[i + 1] / 1024;
			break;
		}
	}
	if (basemem == 0 || basemem > 640) {
		if (bootverbose)
			printf(
		"Memory map doesn't contain a basemem segment, faking it\n");
		basemem = 640;
	}

	/*
	 * Maxmem isn't the "maximum memory", it's one larger than the
	 * highest page of the physical address space.  It should be
	 * called something like "Maxphyspage".  We may adjust this
	 * based on ``hw.physmem'' and the results of the memory test.
	 */
	Maxmem = atop(physmap[physmap_idx + 1]);

#ifdef MAXMEM
	Maxmem = MAXMEM / 4;
#endif

	if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
		Maxmem = atop(physmem_tunable);

	/*
	 * The boot memory test is disabled by default, as it takes a
	 * significant amount of time on large-memory systems, and is
	 * unfriendly to virtual machines as it unnecessarily touches all
	 * pages.
	 *
	 * A general name is used as the code may be extended to support
	 * additional tests beyond the current "page present" test.
	 */
	memtest = 0;
	TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest);

	/*
	 * Don't allow MAXMEM or hw.physmem to extend the amount of memory
	 * in the system.
	 */
	if (Maxmem > atop(physmap[physmap_idx + 1]))
		Maxmem = atop(physmap[physmap_idx + 1]);

	if (atop(physmap[physmap_idx + 1]) != Maxmem &&
	    (boothowto & RB_VERBOSE))
		printf("Physical memory use set to %ldK\n", Maxmem * 4);

	/* call pmap initialization to make new kernel address space */
	pmap_bootstrap(&first);

	/*
	 * Size up each available chunk of physical memory.
	 *
	 * XXX Some BIOSes corrupt low 64KB between suspend and resume.
	 * By default, mask off the first 16 pages unless we appear to be
	 * running in a VM.
	 */
	physmem_start = (vm_guest > VM_GUEST_NO ? 1 : 16) << PAGE_SHIFT;
	TUNABLE_ULONG_FETCH("hw.physmem.start", &physmem_start);
	if (physmap[0] < physmem_start) {
		if (physmem_start < PAGE_SIZE)
			physmap[0] = PAGE_SIZE;
		else if (physmem_start >= physmap[1])
			physmap[0] = round_page(physmap[1] - PAGE_SIZE);
		else
			physmap[0] = round_page(physmem_start);
	}
	pa_indx = 0;
	da_indx = 1;
	phys_avail[pa_indx++] = physmap[0];
	phys_avail[pa_indx] = physmap[0];
	dump_avail[da_indx] = physmap[0];
	pte = CMAP1;

	/*
	 * Get dcons buffer address
	 */
	if (getenv_quad("dcons.addr", &dcons_addr) == 0 ||
	    getenv_quad("dcons.size", &dcons_size) == 0)
		dcons_addr = 0;

	/*
	 * physmap is in bytes, so when converting to page boundaries,
	 * round up the start address and round down the end address.
	 */
	page_counter = 0;
	if (memtest != 0)
		printf("Testing system memory");
	for (i = 0; i <= physmap_idx; i += 2) {
		vm_paddr_t end;

		end = ptoa((vm_paddr_t)Maxmem);
		if (physmap[i + 1] < end)
			end = trunc_page(physmap[i + 1]);
		for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
			int *ptr = (int *)CADDR1;
			int tmp;
			bool full, page_bad;

			full = false;
			/*
			 * block out kernel memory as not available.
			 */
			if (pa >= (vm_paddr_t)kernphys && pa < first)
				goto do_dump_avail;

			/*
			 * block out dcons buffer
			 */
			if (dcons_addr > 0
			    && pa >= trunc_page(dcons_addr)
			    && pa < dcons_addr + dcons_size)
				goto do_dump_avail;

			page_bad = false;
			if (memtest == 0)
				goto skip_memtest;

			/*
			 * Print a "." every GB to show we're making
			 * progress.
			 */
			page_counter++;
			if ((page_counter % PAGES_PER_GB) == 0)
				printf(".");

			/*
			 * map page into kernel: valid, read/write,
			 * non-cacheable
			 */
			*pte = pa | PG_V | PG_RW | PG_NC_PWT | PG_NC_PCD;
			invltlb();

			tmp = *(int *)ptr;
			/*
			 * Test for alternating 1's and 0's
			 */
			*(volatile int *)ptr = 0xaaaaaaaa;
			if (*(volatile int *)ptr != 0xaaaaaaaa)
				page_bad = true;
			/*
			 * Test for alternating 0's and 1's
			 */
			*(volatile int *)ptr = 0x55555555;
			if (*(volatile int *)ptr != 0x55555555)
				page_bad = true;
			/*
			 * Test for all 1's
			 */
			*(volatile int *)ptr = 0xffffffff;
			if (*(volatile int *)ptr != 0xffffffff)
				page_bad = true;
			/*
			 * Test for all 0's
			 */
			*(volatile int *)ptr = 0x0;
			if (*(volatile int *)ptr != 0x0)
				page_bad = true;
			/*
			 * Restore original value.
			 */
			*(int *)ptr = tmp;

skip_memtest:
			/*
			 * Adjust array of valid/good pages.
			 */
			if (page_bad == true)
				continue;
			/*
			 * If this good page is a continuation of the
			 * previous set of good pages, then just increase
			 * the end pointer. Otherwise start a new chunk.
			 * Note that "end" points one past the last valid
			 * address, making the range >= start and < end.
			 * If we're also doing a speculative memory test
			 * and we're at or past the end, bump up Maxmem
			 * so that we keep going. The first bad page
			 * will terminate the loop.
			 */
			if (phys_avail[pa_indx] == pa) {
				phys_avail[pa_indx] += PAGE_SIZE;
			} else {
				pa_indx++;
				if (pa_indx == PHYS_AVAIL_ENTRIES) {
					printf(
		"Too many holes in the physical address space, giving up\n");
					pa_indx--;
					full = true;
					goto do_dump_avail;
				}
				phys_avail[pa_indx++] = pa;	/* start */
				phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */
			}
			physmem++;
do_dump_avail:
			if (dump_avail[da_indx] == pa) {
				dump_avail[da_indx] += PAGE_SIZE;
			} else {
				da_indx++;
				if (da_indx == PHYS_AVAIL_ENTRIES) {
					da_indx--;
					goto do_next;
				}
				dump_avail[da_indx++] = pa; /* start */
				dump_avail[da_indx] = pa + PAGE_SIZE; /* end */
			}
do_next:
			if (full)
				break;
		}
	}
	*pte = 0;
	invltlb();
	if (memtest != 0)
		printf("\n");

	/*
	 * XXX
	 * The last chunk must contain at least one page plus the message
	 * buffer to avoid complicating other code (message buffer address
	 * calculation, etc.).
	 */
	while (phys_avail[pa_indx - 1] + PAGE_SIZE +
	    round_page(msgbufsize) >= phys_avail[pa_indx]) {
		physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
		phys_avail[pa_indx--] = 0;
		phys_avail[pa_indx--] = 0;
	}

	Maxmem = atop(phys_avail[pa_indx]);

	/* Trim off space for the message buffer. */
	phys_avail[pa_indx] -= round_page(msgbufsize);

	/* Map the message buffer. */
	msgbufp = (struct msgbuf *)PHYS_TO_DMAP(phys_avail[pa_indx]);
	TSEXIT();
}

static void
native_parse_preload_data(u_int64_t modulep)
{
	char *envp;
#ifdef DDB
	vm_offset_t ksym_start;
	vm_offset_t ksym_end;
#endif

	preload_metadata = (caddr_t)(uintptr_t)(modulep + KERNBASE);
	preload_bootstrap_relocate(KERNBASE);
	preload_initkmdp(true);
	boothowto = MD_FETCH(preload_kmdp, MODINFOMD_HOWTO, int);
	envp = MD_FETCH(preload_kmdp, MODINFOMD_ENVP, char *);
	if (envp != NULL)
		envp += KERNBASE;
	init_static_kenv(envp, 0);
#ifdef DDB
	ksym_start = MD_FETCH(preload_kmdp, MODINFOMD_SSYM, uintptr_t);
	ksym_end = MD_FETCH(preload_kmdp, MODINFOMD_ESYM, uintptr_t);
	db_fetch_ksymtab(ksym_start, ksym_end, 0);
#endif
	efi_systbl_phys = MD_FETCH(preload_kmdp, MODINFOMD_FW_HANDLE,
	    vm_paddr_t);
}

static void
native_clock_source_init(void)
{
	i8254_init();
}

static void
amd64_kdb_init(void)
{
	kdb_init();
#ifdef KDB
	if (boothowto & RB_KDB)
		kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger");
#endif
}

/* Set up the fast syscall stuff */
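/*
 * EFER.SCE enables SYSCALL/SYSRET.  MSR_LSTAR holds the 64-bit entry
 * point and MSR_CSTAR the 32-bit compat one.  MSR_STAR packs selector
 * bases: bits 47:32 seed %cs/%ss on SYSCALL, bits 63:48 seed them on
 * SYSRET (the 64-bit user %cs is taken from base + 16, which is why
 * GUCODE32_SEL is used here).  MSR_SF_MASK selects the RFLAGS bits
 * cleared on kernel entry.
 */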
void
amd64_conf_fast_syscall(void)
{
	uint64_t msr;

	msr = rdmsr(MSR_EFER) | EFER_SCE;
	wrmsr(MSR_EFER, msr);
	wrmsr(MSR_LSTAR, pti ? (u_int64_t)IDTVEC(fast_syscall_pti) :
	    (u_int64_t)IDTVEC(fast_syscall));
	wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
	msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
	    ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
	wrmsr(MSR_STAR, msr);
	wrmsr(MSR_SF_MASK, PSL_NT | PSL_T | PSL_I | PSL_C | PSL_D | PSL_AC);
}

void
amd64_bsp_pcpu_init1(struct pcpu *pc)
{
	struct user_segment_descriptor *gdt;

	PCPU_SET(prvspace, pc);
	gdt = *PCPU_PTR(gdt);
	PCPU_SET(curthread, &thread0);
	PCPU_SET(tssp, PCPU_PTR(common_tss));
	PCPU_SET(tss, (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
	PCPU_SET(ldt, (struct system_segment_descriptor *)&gdt[GUSERLDT_SEL]);
	PCPU_SET(fs32p, &gdt[GUFS32_SEL]);
	PCPU_SET(gs32p, &gdt[GUGS32_SEL]);
	PCPU_SET(ucr3_load_mask, PMAP_UCR3_NOMASK);
	PCPU_SET(smp_tlb_gen, 1);
}

void
amd64_bsp_pcpu_init2(uint64_t rsp0)
{

	PCPU_SET(rsp0, rsp0);
	PCPU_SET(pti_rsp0, ((vm_offset_t)PCPU_PTR(pti_stack) +
	    PC_PTI_STACK_SZ * sizeof(uint64_t)) & ~0xful);
	PCPU_SET(curpcb, thread0.td_pcb);
}

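/*
 * Dedicated IST stacks give #DF, NMI, #MC and #DB a known-good stack
 * regardless of what %rsp held when the event fired.  A struct nmi_pcpu
 * sits at the top of each stack so the handler can recover its pcpu
 * pointer before the %gs base is known to be sane.
 */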
void
amd64_bsp_ist_init(struct pcpu *pc)
{
	struct nmi_pcpu *np;
	struct amd64tss *tssp;

	tssp = &pc->pc_common_tss;

	/* doublefault stack space, runs on ist1 */
	np = ((struct nmi_pcpu *)&dblfault_stack[sizeof(dblfault_stack)]) - 1;
	np->np_pcpu = (register_t)pc;
	tssp->tss_ist1 = (long)np;

	/*
	 * NMI stack, runs on ist2.  The pcpu pointer is stored just
	 * above the start of the ist2 stack.
	 */
	np = ((struct nmi_pcpu *)&nmi0_stack[sizeof(nmi0_stack)]) - 1;
	np->np_pcpu = (register_t)pc;
	tssp->tss_ist2 = (long)np;

	/*
	 * MC# stack, runs on ist3.  The pcpu pointer is stored just
	 * above the start of the ist3 stack.
	 */
	np = ((struct nmi_pcpu *)&mce0_stack[sizeof(mce0_stack)]) - 1;
	np->np_pcpu = (register_t)pc;
	tssp->tss_ist3 = (long)np;

	/*
	 * DB# stack, runs on ist4.
	 */
	np = ((struct nmi_pcpu *)&dbg0_stack[sizeof(dbg0_stack)]) - 1;
	np->np_pcpu = (register_t)pc;
	tssp->tss_ist4 = (long)np;
}

/*
 * Calculate the kernel load address by inspecting the page table created
 * by the loader.  The assumptions:
 * - kernel is mapped at KERNBASE, backed by contiguous phys memory
 *   aligned at 2M, below 4G (the latter is important for AP startup)
 * - there is a 2M hole at KERNBASE (KERNSTART = KERNBASE + 2M)
 * - kernel is mapped with 2M superpages
 * - all participating memory, i.e. kernel, modules, metadata,
 *   page table is accessible by pre-created 1:1 mapping
 *   (right now loader creates 1:1 mapping for lower 4G, and all
 *   memory is from there)
 * - there is a usable memory block right after the end of the
 *   mapped kernel and all modules/metadata, pointed to by
 *   physfree, for early allocations
 */
vm_paddr_t __nosanitizeaddress __nosanitizememory
amd64_loadaddr(void)
{
	pml4_entry_t *pml4e;
	pdp_entry_t *pdpe;
	pd_entry_t *pde;
	uint64_t cr3;

	cr3 = rcr3();
	pml4e = (pml4_entry_t *)cr3 + pmap_pml4e_index(KERNSTART);
	pdpe = (pdp_entry_t *)(*pml4e & PG_FRAME) + pmap_pdpe_index(KERNSTART);
	pde = (pd_entry_t *)(*pdpe & PG_FRAME) + pmap_pde_index(KERNSTART);
	return (*pde & PG_FRAME);
}

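/*
 * hammer_time() is the early machine-dependent entry point, called from
 * locore on the BSP with the module metadata pointer and the first free
 * physical address; it returns the top of thread0's kernel stack for
 * locore to switch onto.
 */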
u_int64_t
hammer_time(u_int64_t modulep, u_int64_t physfree)
{
	int gsel_tss, x;
	struct pcpu *pc;
	uint64_t rsp0;
	char *env;
	struct user_segment_descriptor *gdt;
	struct region_descriptor r_gdt;
	size_t kstack0_sz;

	TSRAW(&thread0, TS_ENTER, __func__, NULL);

	kernphys = amd64_loadaddr();

	physfree += kernphys;

	/* Initializes preload_kmdp */
	init_ops.parse_preload_data(modulep);

	efi_boot = preload_search_info(preload_kmdp, MODINFO_METADATA |
	    MODINFOMD_EFI_MAP) != NULL;

	if (!efi_boot) {
		/* Tell the bios to warmboot next time */
		atomic_store_short((u_short *)0x472, 0x1234);
	}

	physfree += ucode_load_bsp(physfree - kernphys + KERNSTART);
	physfree = roundup2(physfree, PAGE_SIZE);

	identify_cpu1();
	identify_hypervisor();
	identify_hypervisor_smbios();
	identify_cpu_fixup_bsp();
	identify_cpu2();
	initializecpucache();

	/*
	 * Check for pti, pcid, and invpcid before ifuncs are
	 * resolved, to correctly select the implementation for
	 * pmap_activate_sw_mode().
	 */
	pti = pti_get_default();
	TUNABLE_INT_FETCH("vm.pmap.pti", &pti);
	TUNABLE_INT_FETCH("vm.pmap.pcid_enabled", &pmap_pcid_enabled);
	if ((cpu_feature2 & CPUID2_PCID) == 0)
		pmap_pcid_enabled = 0;
	invpcid_works = (cpu_stdext_feature & CPUID_STDEXT_INVPCID) != 0;

	/*
	 * Now we can do small core initialization, after the PCID
	 * CPU features and user knobs are evaluated.
	 */
	TUNABLE_INT_FETCH("vm.pmap.pcid_invlpg_workaround",
	    &pmap_pcid_invlpg_workaround_uena);
	cpu_init_small_core();

	if ((cpu_feature2 & CPUID2_XSAVE) != 0) {
		use_xsave = 1;
		TUNABLE_INT_FETCH("hw.use_xsave", &use_xsave);
	}

	link_elf_ireloc();

	/*
	 * This may be done better later if it gets more high level
	 * components in it. If so just link td->td_proc here.
	 */
	proc_linkup0(&proc0, &thread0);

	/* Init basic tunables, hz etc */
	init_param1();

	thread0.td_kstack = physfree - kernphys + KERNSTART;
	thread0.td_kstack_pages = kstack_pages;
	kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE;
	bzero((void *)thread0.td_kstack, kstack0_sz);
	physfree += kstack0_sz;

	/*
	 * Initialize enough of thread0 for delayed invalidation to
	 * work very early.  Rely on thread0.td_base_pri
	 * zero-initialization, it is reset to PVM at proc0_init().
	 */
	pmap_thread_init_invl_gen(&thread0);

	pc = &temp_bsp_pcpu;
	pcpu_init(pc, 0, sizeof(struct pcpu));
	gdt = &temp_bsp_pcpu.pc_gdt[0];

	/*
	 * make gdt memory segments
	 */
	for (x = 0; x < NGDT; x++) {
		if (x != GPROC0_SEL && x != (GPROC0_SEL + 1) &&
		    x != GUSERLDT_SEL && x != (GUSERLDT_SEL + 1))
			ssdtosd(&gdt_segs[x], &gdt[x]);
	}
	gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&pc->pc_common_tss;
	ssdtosyssd(&gdt_segs[GPROC0_SEL],
	    (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);

	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
	r_gdt.rd_base = (long)gdt;
	lgdt(&r_gdt);

	wrmsr(MSR_FSBASE, 0);		/* User value */
	wrmsr(MSR_GSBASE, (u_int64_t)pc);
	wrmsr(MSR_KGSBASE, 0);		/* User value while in the kernel */

	dpcpu_init((void *)(physfree - kernphys + KERNSTART), 0);
	physfree += DPCPU_SIZE;
	amd64_bsp_pcpu_init1(pc);
	/* Non-late cninit() and printf() can be moved up to here. */

	/*
	 * Initialize mutexes.
	 *
	 * icu_lock: in order to allow an interrupt to occur in a critical
	 *	     section, to set pcpu->ipending (etc...) properly, we
	 *	     must be able to get the icu lock, so it can't be
	 *	     under witness.
	 */
	mutex_init();
	mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS);
	mtx_init(&dt_lock, "descriptor tables", NULL, MTX_DEF);

	/* exceptions */
	for (x = 0; x < NIDT; x++)
		setidt(x, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_SYSIGT,
		    SEL_KPL, 0);
	setidt(IDT_DE, pti ? &IDTVEC(div_pti) : &IDTVEC(div), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 4);
	setidt(IDT_NMI, &IDTVEC(nmi), SDT_SYSIGT, SEL_KPL, 2);
	setidt(IDT_BP, pti ? &IDTVEC(bpt_pti) : &IDTVEC(bpt), SDT_SYSIGT,
	    SEL_UPL, 0);
	setidt(IDT_OF, pti ? &IDTVEC(ofl_pti) : &IDTVEC(ofl), SDT_SYSIGT,
	    SEL_UPL, 0);
	setidt(IDT_BR, pti ? &IDTVEC(bnd_pti) : &IDTVEC(bnd), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_UD, pti ? &IDTVEC(ill_pti) : &IDTVEC(ill), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_NM, pti ? &IDTVEC(dna_pti) : &IDTVEC(dna), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
	setidt(IDT_FPUGP, pti ? &IDTVEC(fpusegm_pti) : &IDTVEC(fpusegm),
	    SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_TS, pti ? &IDTVEC(tss_pti) : &IDTVEC(tss), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_NP, pti ? &IDTVEC(missing_pti) : &IDTVEC(missing),
	    SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_SS, pti ? &IDTVEC(stk_pti) : &IDTVEC(stk), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_GP, pti ? &IDTVEC(prot_pti) : &IDTVEC(prot), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_PF, pti ? &IDTVEC(page_pti) : &IDTVEC(page), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_MF, pti ? &IDTVEC(fpu_pti) : &IDTVEC(fpu), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_AC, pti ? &IDTVEC(align_pti) : &IDTVEC(align), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 3);
	setidt(IDT_XF, pti ? &IDTVEC(xmm_pti) : &IDTVEC(xmm), SDT_SYSIGT,
	    SEL_KPL, 0);
#ifdef KDTRACE_HOOKS
	setidt(IDT_DTRACE_RET, pti ? &IDTVEC(dtrace_ret_pti) :
	    &IDTVEC(dtrace_ret), SDT_SYSIGT, SEL_UPL, 0);
#endif
#ifdef XENHVM
	setidt(IDT_EVTCHN, pti ? &IDTVEC(xen_intr_upcall_pti) :
	    &IDTVEC(xen_intr_upcall), SDT_SYSIGT, SEL_KPL, 0);
#endif
	r_idt.rd_limit = sizeof(idt0) - 1;
	r_idt.rd_base = (long)idt;
	lidt(&r_idt);

	TUNABLE_INT_FETCH("hw.ibrs_disable", &hw_ibrs_disable);
	TUNABLE_INT_FETCH("machdep.mitigations.ibrs.disable", &hw_ibrs_disable);

	TUNABLE_INT_FETCH("hw.spec_store_bypass_disable", &hw_ssb_disable);
	TUNABLE_INT_FETCH("machdep.mitigations.ssb.disable", &hw_ssb_disable);

	TUNABLE_INT_FETCH("machdep.syscall_ret_flush_l1d",
	    &syscall_ret_l1d_flush_mode);

	TUNABLE_INT_FETCH("hw.mds_disable", &hw_mds_disable);
	TUNABLE_INT_FETCH("machdep.mitigations.mds.disable", &hw_mds_disable);

	TUNABLE_INT_FETCH("machdep.mitigations.taa.enable", &x86_taa_enable);

	TUNABLE_INT_FETCH("machdep.mitigations.rngds.enable",
	    &x86_rngds_mitg_enable);

	TUNABLE_INT_FETCH("machdep.mitigations.zenbleed.enable",
	    &zenbleed_enable);
	zenbleed_sanitize_enable();

	finishidentcpu();	/* Final stage of CPU initialization */

	invlpgb_works = (amd_extended_feature_extensions &
	    AMDFEID_INVLPGB) != 0;
	TUNABLE_INT_FETCH("vm.pmap.invlpgb_works", &invlpgb_works);
	if (invlpgb_works)
		invlpgb_maxcnt = cpu_procinfo3 & AMDID_INVLPGB_MAXCNT;

	/*
	 * Initialize the clock before the console so that console
	 * initialization can use DELAY().
	 */
	clock_init();

	initializecpu();	/* Initialize CPU registers */

	amd64_bsp_ist_init(pc);

	/* Set the IO permission bitmap (empty due to tss seg limit) */
	pc->pc_common_tss.tss_iobase = sizeof(struct amd64tss) +
	    IOPERM_BITMAP_SIZE;

	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
	ltr(gsel_tss);

	amd64_conf_fast_syscall();

	/*
	 * We initialize the PCB pointer early so that exception
	 * handlers will work.  Also set up td_critnest to short-cut
	 * the page fault handler.
	 */
	cpu_max_ext_state_size = sizeof(struct savefpu);
	set_top_of_stack_td(&thread0);
	thread0.td_pcb = get_pcb_td(&thread0);
	thread0.td_critnest = 1;

	/*
	 * The console and kdb should be initialized even earlier than here,
	 * but some console drivers don't work until after getmemsize().
	 * Default to late console initialization to support these drivers.
	 * This loses mainly printf()s in getmemsize() and early debugging.
	 */
	TUNABLE_INT_FETCH("debug.late_console", &late_console);
	if (!late_console) {
		cninit();
		amd64_kdb_init();
	}

	getmemsize(physfree);
	init_param2(physmem);

	/* now running on new page tables, configured, and u/iom is accessible */

#ifdef DEV_PCI
	/* This call might adjust phys_avail[]. */
	pci_early_quirks();
#endif

	if (late_console)
		cninit();

	/*
	 * Dump the boot metadata. We have to wait for cninit() since console
	 * output is required. If it's grossly incorrect the kernel will never
	 * make it this far.
	 */
	if (getenv_is_true("debug.dump_modinfo_at_boot"))
		preload_dump();

#ifdef DEV_ISA
#ifdef DEV_ATPIC
	elcr_probe();
	atpic_startup();
#else
	/* Reset and mask the atpics and leave them shut down. */
	atpic_reset();

	/*
	 * Point the ICU spurious interrupt vectors at the APIC spurious
	 * interrupt handler.
	 */
	setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
#endif
#else
#error "have you forgotten the isa device?"
#endif

	if (late_console)
		amd64_kdb_init();

	msgbufinit(msgbufp, msgbufsize);
	fpuinit();

	/* make an initial tss so cpu can get interrupt stack on syscall! */
	rsp0 = thread0.td_md.md_stack_base;
	/* Ensure the stack is aligned to 16 bytes */
	rsp0 &= ~0xFul;
	PCPU_PTR(common_tss)->tss_rsp0 = rsp0;
	amd64_bsp_pcpu_init2(rsp0);

	/* transfer to user mode */

	_ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
	_udatasel = GSEL(GUDATA_SEL, SEL_UPL);
	_ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL);
	_ufssel = GSEL(GUFS32_SEL, SEL_UPL);
	_ugssel = GSEL(GUGS32_SEL, SEL_UPL);

	load_ds(_udatasel);
	load_es(_udatasel);
	load_fs(_ufssel);

	/* setup proc 0's pcb */
	thread0.td_pcb->pcb_flags = 0;

	env = kern_getenv("kernelname");
	if (env != NULL)
		strlcpy(kernelname, env, sizeof(kernelname));

	kcsan_cpu_init(0);

#ifdef FDT
	x86_init_fdt();
#endif
	thread0.td_critnest = 0;

	kasan_init();
	kmsan_init();

	TSEXIT();

	/* Location of kernel stack for locore */
	return (thread0.td_md.md_stack_base);
}

void
cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
{

	pcpu->pc_acpi_id = 0xffffffff;
}

static int
smap_sysctl_handler(SYSCTL_HANDLER_ARGS)
{
	struct bios_smap *smapbase;
	struct bios_smap_xattr smap;
	uint32_t *smapattr;
	int count, error, i;

	/* Retrieve the system memory map from the loader. */
	smapbase = (struct bios_smap *)preload_search_info(preload_kmdp,
	    MODINFO_METADATA | MODINFOMD_SMAP);
	if (smapbase == NULL)
		return (0);
	smapattr = (uint32_t *)preload_search_info(preload_kmdp,
	    MODINFO_METADATA | MODINFOMD_SMAP_XATTR);
	count = *((uint32_t *)smapbase - 1) / sizeof(*smapbase);
	error = 0;
	for (i = 0; i < count; i++) {
		smap.base = smapbase[i].base;
		smap.length = smapbase[i].length;
		smap.type = smapbase[i].type;
		if (smapattr != NULL)
			smap.xattr = smapattr[i];
		else
			smap.xattr = 0;
		error = SYSCTL_OUT(req, &smap, sizeof(smap));
	}
	return (error);
}
SYSCTL_PROC(_machdep, OID_AUTO, smap,
    CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    smap_sysctl_handler, "S,bios_smap_xattr",
    "Raw BIOS SMAP data");

static int
efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS)
{
	struct efi_map_header *efihdr;
	uint32_t efisize;

	efihdr = (struct efi_map_header *)preload_search_info(preload_kmdp,
	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
	if (efihdr == NULL)
		return (0);
	efisize = *((uint32_t *)efihdr - 1);
	return (SYSCTL_OUT(req, efihdr, efisize));
}
SYSCTL_PROC(_machdep, OID_AUTO, efi_map,
    CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    efi_map_sysctl_handler, "S,efi_map_header",
    "Raw EFI Memory Map");

static int
efi_arch_sysctl_handler(SYSCTL_HANDLER_ARGS)
{
	char *arch;

	arch = (char *)preload_search_info(preload_kmdp,
	    MODINFO_METADATA | MODINFOMD_EFI_ARCH);
	if (arch == NULL)
		return (0);

	return (SYSCTL_OUT_STR(req, arch));
}
SYSCTL_PROC(_machdep, OID_AUTO, efi_arch,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    efi_arch_sysctl_handler, "A", "EFI Firmware Architecture");

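/*
 * Spinlock entry and exit nest: only the outermost spinlock_enter()
 * disables interrupts (saving the previous flags) and enters a critical
 * section; nested acquisitions just bump md_spinlock_count.
 */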
void
spinlock_enter(void)
{
	struct thread *td;
	register_t flags;

	td = curthread;
	if (td->td_md.md_spinlock_count == 0) {
		flags = intr_disable();
		td->td_md.md_spinlock_count = 1;
		td->td_md.md_saved_flags = flags;
		critical_enter();
	} else
		td->td_md.md_spinlock_count++;
}

void
spinlock_exit(void)
{
	struct thread *td;
	register_t flags;

	td = curthread;
	flags = td->td_md.md_saved_flags;
	td->td_md.md_spinlock_count--;
	if (td->td_md.md_spinlock_count == 0) {
		critical_exit();
		intr_restore(flags);
	}
}

/*
 * Construct a PCB from a trapframe. This is called from kdb_trap() where
 * we want to start a backtrace from the function that caused us to enter
 * the debugger. We have the context in the trapframe, but base the trace
 * on the PCB. The PCB doesn't have to be perfect, as long as it contains
 * enough for a backtrace.
 */
void
makectx(struct trapframe *tf, struct pcb *pcb)
{

	pcb->pcb_r12 = tf->tf_r12;
	pcb->pcb_r13 = tf->tf_r13;
	pcb->pcb_r14 = tf->tf_r14;
	pcb->pcb_r15 = tf->tf_r15;
	pcb->pcb_rbp = tf->tf_rbp;
	pcb->pcb_rbx = tf->tf_rbx;
	pcb->pcb_rip = tf->tf_rip;
	pcb->pcb_rsp = tf->tf_rsp;
}

/*
 * pcb_flags is only modified by the current thread, or by other threads
 * when the current thread is stopped.  However, the current thread may
 * change it from interrupt context in cpu_switch(), or in the trap handler.
 * When we read-modify-write pcb_flags from C sources, the compiler may
 * generate code that is not atomic with respect to the interrupt handler.
 * If a trap or interrupt happens and any flag is modified from the handler,
 * it can be clobbered with the cached value later.  Therefore, we implement
 * setting and clearing flags with single-instruction functions, which do
 * not race with possible modification of the flags from the trap or
 * interrupt context, because traps and interrupts are executed only on
 * instruction boundaries.
 */
void
set_pcb_flags_raw(struct pcb *pcb, const u_int flags)
{

	__asm __volatile("orl %1,%0"
	    : "=m" (pcb->pcb_flags) : "ir" (flags), "m" (pcb->pcb_flags)
	    : "cc", "memory");
}

/*
 * The support for RDFSBASE, WRFSBASE and similar instructions for the %gs
 * base requires that the kernel save MSR_FSBASE and MSR_{K,}GSBASE into
 * the pcb if user space modified the bases.  We must save on the context
 * switch or if the return to usermode happens through doreti.
 *
 * Tracking of both events is performed by the pcb flag PCB_FULL_IRET,
 * which has the consequence that the base MSRs must be saved each time
 * the PCB_FULL_IRET flag is set.  We disable interrupts to sync with
 * context switches.
 */
static void
set_pcb_flags_fsgsbase(struct pcb *pcb, const u_int flags)
{
	register_t r;

	if (curpcb == pcb &&
	    (flags & PCB_FULL_IRET) != 0 &&
	    (pcb->pcb_flags & PCB_FULL_IRET) == 0) {
		r = intr_disable();
		if ((pcb->pcb_flags & PCB_FULL_IRET) == 0) {
			pcb->pcb_fsbase = rdfsbase();
			pcb->pcb_gsbase = rdmsr(MSR_KGSBASE);
		}
		set_pcb_flags_raw(pcb, flags);
		intr_restore(r);
	} else {
		set_pcb_flags_raw(pcb, flags);
	}
}

DEFINE_IFUNC(, void, set_pcb_flags, (struct pcb *, const u_int))
{

	return ((cpu_stdext_feature & CPUID_STDEXT_FSGSBASE) != 0 ?
	    set_pcb_flags_fsgsbase : set_pcb_flags_raw);
}

void
clear_pcb_flags(struct pcb *pcb, const u_int flags)
{

	__asm __volatile("andl %1,%0"
	    : "=m" (pcb->pcb_flags) : "ir" (~flags), "m" (pcb->pcb_flags)
	    : "cc", "memory");
}

#ifdef KDB

/*
 * Provide inb() and outb() as functions.  They are normally only available as
 * inline functions, thus cannot be called from the debugger.
 */

/* silence compiler warnings */
u_char inb_(u_short);
void outb_(u_short, u_char);

u_char
inb_(u_short port)
{
	return (inb(port));
}

void
outb_(u_short port, u_char data)
{
	outb(port, data);
}

#endif /* KDB */

#undef memset
#undef memmove
#undef memcpy

void	*memset_std(void *buf, int c, size_t len);
void	*memset_erms(void *buf, int c, size_t len);
void	*memmove_std(void * _Nonnull dst, const void * _Nonnull src,
	    size_t len);
void	*memmove_erms(void * _Nonnull dst, const void * _Nonnull src,
	    size_t len);
void	*memcpy_std(void * _Nonnull dst, const void * _Nonnull src,
	    size_t len);
void	*memcpy_erms(void * _Nonnull dst, const void * _Nonnull src,
	    size_t len);

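/*
 * The resolvers below select the "rep stosb"/"rep movsb" (_erms)
 * implementations when CPUID advertises Enhanced REP MOVSB/STOSB
 * (CPUID_STDEXT_ERMS), and fall back to the classic variants otherwise.
 */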
#ifdef KCSAN
/*
 * These fail to build as ifuncs when used with KCSAN.
 */
void *
memset(void *buf, int c, size_t len)
{

	return (memset_std(buf, c, len));
}

void *
memmove(void * _Nonnull dst, const void * _Nonnull src, size_t len)
{

	return (memmove_std(dst, src, len));
}

void *
memcpy(void * _Nonnull dst, const void * _Nonnull src, size_t len)
{

	return (memcpy_std(dst, src, len));
}
#else
DEFINE_IFUNC(, void *, memset, (void *, int, size_t))
{

	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
	    memset_erms : memset_std);
}

DEFINE_IFUNC(, void *, memmove, (void * _Nonnull, const void * _Nonnull,
    size_t))
{

	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
	    memmove_erms : memmove_std);
}

DEFINE_IFUNC(, void *, memcpy, (void * _Nonnull, const void * _Nonnull,
    size_t))
{

	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
	    memcpy_erms : memcpy_std);
}
#endif

void	pagezero_std(void *addr);
void	pagezero_erms(void *addr);
DEFINE_IFUNC(, void, pagezero, (void *))
{

	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
	    pagezero_erms : pagezero_std);
}