xref: /freebsd/sys/amd64/amd64/machdep.c (revision f3754afd5901857787271e73f9c34d3b9069a03f)
1 /*-
2  * SPDX-License-Identifier: BSD-4-Clause
3  *
4  * Copyright (c) 2003 Peter Wemm.
5  * Copyright (c) 1992 Terrence R. Lambert.
6  * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
7  * All rights reserved.
8  *
9  * This code is derived from software contributed to Berkeley by
10  * William Jolitz.
11  *
12  * Redistribution and use in source and binary forms, with or without
13  * modification, are permitted provided that the following conditions
14  * are met:
15  * 1. Redistributions of source code must retain the above copyright
16  *    notice, this list of conditions and the following disclaimer.
17  * 2. Redistributions in binary form must reproduce the above copyright
18  *    notice, this list of conditions and the following disclaimer in the
19  *    documentation and/or other materials provided with the distribution.
20  * 3. All advertising materials mentioning features or use of this software
21  *    must display the following acknowledgement:
22  *	This product includes software developed by the University of
23  *	California, Berkeley and its contributors.
24  * 4. Neither the name of the University nor the names of its contributors
25  *    may be used to endorse or promote products derived from this software
26  *    without specific prior written permission.
27  *
28  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
29  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
32  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
33  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
34  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
35  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
37  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
38  * SUCH DAMAGE.
39  */
40 
41 #include <sys/cdefs.h>
42 #include "opt_atpic.h"
43 #include "opt_cpu.h"
44 #include "opt_ddb.h"
45 #include "opt_inet.h"
46 #include "opt_isa.h"
47 #include "opt_kstack_pages.h"
48 #include "opt_maxmem.h"
49 #include "opt_pci.h"
50 #include "opt_platform.h"
51 #include "opt_sched.h"
52 
53 #include <sys/param.h>
54 #include <sys/proc.h>
55 #include <sys/systm.h>
56 #include <sys/asan.h>
57 #include <sys/bio.h>
58 #include <sys/buf.h>
59 #include <sys/bus.h>
60 #include <sys/callout.h>
61 #include <sys/cons.h>
62 #include <sys/cpu.h>
63 #include <sys/csan.h>
64 #include <sys/efi.h>
65 #include <sys/eventhandler.h>
66 #include <sys/exec.h>
67 #include <sys/imgact.h>
68 #include <sys/kdb.h>
69 #include <sys/kernel.h>
70 #include <sys/ktr.h>
71 #include <sys/linker.h>
72 #include <sys/lock.h>
73 #include <sys/malloc.h>
74 #include <sys/memrange.h>
75 #include <sys/msan.h>
76 #include <sys/msgbuf.h>
77 #include <sys/mutex.h>
78 #include <sys/pcpu.h>
79 #include <sys/ptrace.h>
80 #include <sys/reboot.h>
81 #include <sys/reg.h>
82 #include <sys/rwlock.h>
83 #include <sys/sched.h>
84 #include <sys/signalvar.h>
85 #ifdef SMP
86 #include <sys/smp.h>
87 #endif
88 #include <sys/syscallsubr.h>
89 #include <sys/sysctl.h>
90 #include <sys/sysent.h>
91 #include <sys/sysproto.h>
92 #include <sys/ucontext.h>
93 #include <sys/vmmeter.h>
94 
95 #include <vm/vm.h>
96 #include <vm/vm_param.h>
97 #include <vm/vm_extern.h>
98 #include <vm/vm_kern.h>
99 #include <vm/vm_page.h>
100 #include <vm/vm_map.h>
101 #include <vm/vm_object.h>
102 #include <vm/vm_pager.h>
103 #include <vm/vm_phys.h>
104 #include <vm/vm_dumpset.h>
105 
106 #ifdef DDB
107 #ifndef KDB
108 #error KDB must be enabled in order for DDB to work!
109 #endif
110 #include <ddb/ddb.h>
111 #include <ddb/db_sym.h>
112 #endif
113 
114 #include <net/netisr.h>
115 
116 #include <dev/smbios/smbios.h>
117 
118 #include <machine/clock.h>
119 #include <machine/cpu.h>
120 #include <machine/cputypes.h>
121 #include <machine/frame.h>
122 #include <machine/intr_machdep.h>
123 #include <x86/mca.h>
124 #include <machine/md_var.h>
125 #include <machine/metadata.h>
126 #include <machine/pc/bios.h>
127 #include <machine/pcb.h>
128 #include <machine/proc.h>
129 #include <machine/sigframe.h>
130 #include <machine/specialreg.h>
131 #include <machine/trap.h>
132 #include <machine/tss.h>
133 #include <x86/ucode.h>
134 #include <x86/ifunc.h>
135 #ifdef SMP
136 #include <machine/smp.h>
137 #endif
138 #ifdef FDT
139 #include <x86/fdt.h>
140 #endif
141 
142 #ifdef DEV_ATPIC
143 #include <x86/isa/icu.h>
144 #else
145 #include <x86/apicvar.h>
146 #endif
147 
148 #include <isa/isareg.h>
149 #include <isa/rtc.h>
150 #include <x86/init.h>
151 
152 /* Sanity check for __curthread() */
153 CTASSERT(offsetof(struct pcpu, pc_curthread) == 0);
154 
155 /*
156  * The PTI trampoline stack needs enough space for a hardware trapframe and a
157  * couple of scratch registers, as well as the trapframe left behind after an
158  * iret fault.
159  */
160 CTASSERT(PC_PTI_STACK_SZ * sizeof(register_t) >= 2 * sizeof(struct pti_frame) -
161     offsetof(struct pti_frame, pti_rip));
162 
163 extern u_int64_t hammer_time(u_int64_t, u_int64_t);
164 
165 static void cpu_startup(void *);
166 SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL);
167 
168 /* Probe 8254 PIT and TSC. */
169 static void native_clock_source_init(void);
170 
171 /* Preload data parse function */
172 static caddr_t native_parse_preload_data(u_int64_t);
173 
174 /* Native function to fetch and parse the e820 map */
175 static void native_parse_memmap(caddr_t, vm_paddr_t *, int *);
176 
177 /* Default init_ops implementation. */
178 struct init_ops init_ops = {
179 	.parse_preload_data =		native_parse_preload_data,
180 	.early_clock_source_init =	native_clock_source_init,
181 	.early_delay =			i8254_delay,
182 	.parse_memmap =			native_parse_memmap,
183 };
184 
185 /*
186  * Physical address of the EFI System Table. Stashed from the metadata hints
187  * passed into the kernel and used by the EFI code to call runtime services.
188  */
189 vm_paddr_t efi_systbl_phys;
190 
191 /* Intel ICH registers */
192 #define ICH_PMBASE	0x400
193 #define ICH_SMI_EN	ICH_PMBASE + 0x30
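/*
 * ICH_SMI_EN is the SMI Control and Enable register, at offset 0x30 from
 * the power-management I/O base; cpu_startup() below clears its
 * LEGACY_USB_EN bit on the affected MacBook models.
 */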
194 
195 int	_udatasel, _ucodesel, _ucode32sel, _ufssel, _ugssel;
196 
197 int cold = 1;
198 
199 long Maxmem = 0;
200 long realmem = 0;
201 int late_console = 1;
202 
203 struct kva_md_info kmi;
204 
205 struct region_descriptor r_idt;
206 
207 struct pcpu *__pcpu;
208 struct pcpu temp_bsp_pcpu;
209 
210 struct mtx icu_lock;
211 
212 struct mem_range_softc mem_range_softc;
213 
214 struct mtx dt_lock;	/* lock for GDT and LDT */
215 
216 void (*vmm_resume_p)(void);
217 
218 bool efi_boot;
219 
220 static void
221 cpu_startup(void *dummy)
222 {
223 	uintmax_t memsize;
224 	char *sysenv;
225 
226 	/*
227 	 * On MacBooks, we need to prevent the legacy USB circuit from
228 	 * generating an SMI# because this can cause several problems,
229 	 * namely: incorrect CPU frequency detection and failure to
230 	 * start the APs.
231 	 * We do this by disabling a bit in the SMI_EN (SMI Control and
232 	 * Enable register) of the Intel ICH LPC Interface Bridge.
233 	 */
234 	sysenv = kern_getenv("smbios.system.product");
235 	if (sysenv != NULL) {
236 		if (strncmp(sysenv, "MacBook1,1", 10) == 0 ||
237 		    strncmp(sysenv, "MacBook3,1", 10) == 0 ||
238 		    strncmp(sysenv, "MacBook4,1", 10) == 0 ||
239 		    strncmp(sysenv, "MacBookPro1,1", 13) == 0 ||
240 		    strncmp(sysenv, "MacBookPro1,2", 13) == 0 ||
241 		    strncmp(sysenv, "MacBookPro3,1", 13) == 0 ||
242 		    strncmp(sysenv, "MacBookPro4,1", 13) == 0 ||
243 		    strncmp(sysenv, "Macmini1,1", 10) == 0) {
244 			if (bootverbose)
245 				printf("Disabling LEGACY_USB_EN bit on "
246 				    "Intel ICH.\n");
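			/* SMI_EN bit 3 is LEGACY_USB_EN; clearing it stops legacy USB SMI# generation. */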
247 			outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8);
248 		}
249 		freeenv(sysenv);
250 	}
251 
252 	/*
253 	 * Good {morning,afternoon,evening,night}.
254 	 */
255 	startrtclock();
256 	printcpuinfo();
257 
258 	/*
259 	 * Display physical memory if SMBIOS reports reasonable amount.
260 	 */
261 	memsize = 0;
262 	sysenv = kern_getenv("smbios.memory.enabled");
263 	if (sysenv != NULL) {
264 		memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10;
265 		freeenv(sysenv);
266 	}
267 	if (memsize < ptoa((uintmax_t)vm_free_count()))
268 		memsize = ptoa((uintmax_t)Maxmem);
269 	printf("real memory  = %ju (%ju MB)\n", memsize, memsize >> 20);
270 	realmem = atop(memsize);
271 
272 	/*
273 	 * Display any holes after the first chunk of extended memory.
274 	 */
275 	if (bootverbose) {
276 		int indx;
277 
278 		printf("Physical memory chunk(s):\n");
279 		for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
280 			vm_paddr_t size;
281 
282 			size = phys_avail[indx + 1] - phys_avail[indx];
283 			printf(
284 			    "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n",
285 			    (uintmax_t)phys_avail[indx],
286 			    (uintmax_t)phys_avail[indx + 1] - 1,
287 			    (uintmax_t)size, (uintmax_t)size / PAGE_SIZE);
288 		}
289 	}
290 
291 	vm_ksubmap_init(&kmi);
292 
293 	printf("avail memory = %ju (%ju MB)\n",
294 	    ptoa((uintmax_t)vm_free_count()),
295 	    ptoa((uintmax_t)vm_free_count()) / 1048576);
296 #ifdef DEV_PCI
297 	if (bootverbose && intel_graphics_stolen_base != 0)
298 		printf("intel stolen mem: base %#jx size %ju MB\n",
299 		    (uintmax_t)intel_graphics_stolen_base,
300 		    (uintmax_t)intel_graphics_stolen_size / 1024 / 1024);
301 #endif
302 
303 	/*
304 	 * Set up buffers, so they can be used to read disk labels.
305 	 */
306 	bufinit();
307 	vm_pager_bufferinit();
308 
309 	cpu_setregs();
310 }
311 
312 static void
313 late_ifunc_resolve(void *dummy __unused)
314 {
315 	link_elf_late_ireloc();
316 }
317 SYSINIT(late_ifunc_resolve, SI_SUB_CPU, SI_ORDER_ANY, late_ifunc_resolve, NULL);
318 
319 
320 void
321 cpu_setregs(void)
322 {
323 	register_t cr0;
324 
325 	TSENTER();
326 	cr0 = rcr0();
327 	cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM;
328 	TSENTER2("load_cr0");
329 	load_cr0(cr0);
330 	TSEXIT2("load_cr0");
331 	TSEXIT();
332 }
333 
334 /*
335  * Initialize amd64 and configure to run kernel
336  */
337 
338 /*
339  * Initialize segments & interrupt table
340  */
341 static struct gate_descriptor idt0[NIDT];
342 struct gate_descriptor *idt = &idt0[0];	/* interrupt descriptor table */
343 
344 static char dblfault_stack[DBLFAULT_STACK_SIZE] __aligned(16);
345 static char mce0_stack[MCE_STACK_SIZE] __aligned(16);
346 static char nmi0_stack[NMI_STACK_SIZE] __aligned(16);
347 static char dbg0_stack[DBG_STACK_SIZE] __aligned(16);
348 CTASSERT(sizeof(struct nmi_pcpu) == 16);
349 
350 /*
351  * Software prototypes -- in more palatable form.
352  *
353  * Keep GUFS32, GUGS32, GUCODE32 and GUDATA at the same
354  * slots as corresponding segments for i386 kernel.
355  */
356 struct soft_segment_descriptor gdt_segs[] = {
357 [GNULL_SEL] = { /* 0 Null Descriptor */
358 	.ssd_base = 0x0,
359 	.ssd_limit = 0x0,
360 	.ssd_type = 0,
361 	.ssd_dpl = 0,
362 	.ssd_p = 0,
363 	.ssd_long = 0,
364 	.ssd_def32 = 0,
365 	.ssd_gran = 0		},
366 [GNULL2_SEL] = { /*	1 Null Descriptor */
367 	.ssd_base = 0x0,
368 	.ssd_limit = 0x0,
369 	.ssd_type = 0,
370 	.ssd_dpl = 0,
371 	.ssd_p = 0,
372 	.ssd_long = 0,
373 	.ssd_def32 = 0,
374 	.ssd_gran = 0		},
375 [GUFS32_SEL] = { /* 2 32 bit %gs Descriptor for user */
376 	.ssd_base = 0x0,
377 	.ssd_limit = 0xfffff,
378 	.ssd_type = SDT_MEMRWA,
379 	.ssd_dpl = SEL_UPL,
380 	.ssd_p = 1,
381 	.ssd_long = 0,
382 	.ssd_def32 = 1,
383 	.ssd_gran = 1		},
384 [GUGS32_SEL] = { /* 3 32 bit %fs Descriptor for user */
385 	.ssd_base = 0x0,
386 	.ssd_limit = 0xfffff,
387 	.ssd_type = SDT_MEMRWA,
388 	.ssd_dpl = SEL_UPL,
389 	.ssd_p = 1,
390 	.ssd_long = 0,
391 	.ssd_def32 = 1,
392 	.ssd_gran = 1		},
393 [GCODE_SEL] = { /* 4 Code Descriptor for kernel */
394 	.ssd_base = 0x0,
395 	.ssd_limit = 0xfffff,
396 	.ssd_type = SDT_MEMERA,
397 	.ssd_dpl = SEL_KPL,
398 	.ssd_p = 1,
399 	.ssd_long = 1,
400 	.ssd_def32 = 0,
401 	.ssd_gran = 1		},
402 [GDATA_SEL] = { /* 5 Data Descriptor for kernel */
403 	.ssd_base = 0x0,
404 	.ssd_limit = 0xfffff,
405 	.ssd_type = SDT_MEMRWA,
406 	.ssd_dpl = SEL_KPL,
407 	.ssd_p = 1,
408 	.ssd_long = 1,
409 	.ssd_def32 = 0,
410 	.ssd_gran = 1		},
411 [GUCODE32_SEL] = { /* 6 32 bit Code Descriptor for user */
412 	.ssd_base = 0x0,
413 	.ssd_limit = 0xfffff,
414 	.ssd_type = SDT_MEMERA,
415 	.ssd_dpl = SEL_UPL,
416 	.ssd_p = 1,
417 	.ssd_long = 0,
418 	.ssd_def32 = 1,
419 	.ssd_gran = 1		},
420 [GUDATA_SEL] = { /* 7 32/64 bit Data Descriptor for user */
421 	.ssd_base = 0x0,
422 	.ssd_limit = 0xfffff,
423 	.ssd_type = SDT_MEMRWA,
424 	.ssd_dpl = SEL_UPL,
425 	.ssd_p = 1,
426 	.ssd_long = 0,
427 	.ssd_def32 = 1,
428 	.ssd_gran = 1		},
429 [GUCODE_SEL] = { /* 8 64 bit Code Descriptor for user */
430 	.ssd_base = 0x0,
431 	.ssd_limit = 0xfffff,
432 	.ssd_type = SDT_MEMERA,
433 	.ssd_dpl = SEL_UPL,
434 	.ssd_p = 1,
435 	.ssd_long = 1,
436 	.ssd_def32 = 0,
437 	.ssd_gran = 1		},
438 [GPROC0_SEL] = { /* 9 Proc 0 TSS Descriptor */
439 	.ssd_base = 0x0,
440 	.ssd_limit = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE - 1,
441 	.ssd_type = SDT_SYSTSS,
442 	.ssd_dpl = SEL_KPL,
443 	.ssd_p = 1,
444 	.ssd_long = 0,
445 	.ssd_def32 = 0,
446 	.ssd_gran = 0		},
447 [GPROC0_SEL + 1] = { /* 10 Proc 0 TSS descriptor, double size */
448 	.ssd_base = 0x0,
449 	.ssd_limit = 0x0,
450 	.ssd_type = 0,
451 	.ssd_dpl = 0,
452 	.ssd_p = 0,
453 	.ssd_long = 0,
454 	.ssd_def32 = 0,
455 	.ssd_gran = 0		},
456 [GUSERLDT_SEL] = { /* 11 LDT Descriptor */
457 	.ssd_base = 0x0,
458 	.ssd_limit = 0x0,
459 	.ssd_type = 0,
460 	.ssd_dpl = 0,
461 	.ssd_p = 0,
462 	.ssd_long = 0,
463 	.ssd_def32 = 0,
464 	.ssd_gran = 0		},
465 [GUSERLDT_SEL + 1] = { /* 12 LDT Descriptor, double size */
466 	.ssd_base = 0x0,
467 	.ssd_limit = 0x0,
468 	.ssd_type = 0,
469 	.ssd_dpl = 0,
470 	.ssd_p = 0,
471 	.ssd_long = 0,
472 	.ssd_def32 = 0,
473 	.ssd_gran = 0		},
474 };
475 _Static_assert(nitems(gdt_segs) == NGDT, "Stale NGDT");
476 
477 void
478 setidt(int idx, inthand_t *func, int typ, int dpl, int ist)
479 {
480 	struct gate_descriptor *ip;
481 
482 	ip = idt + idx;
483 	ip->gd_looffset = (uintptr_t)func;
484 	ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
485 	ip->gd_ist = ist;
486 	ip->gd_xx = 0;
487 	ip->gd_type = typ;
488 	ip->gd_dpl = dpl;
489 	ip->gd_p = 1;
490 	ip->gd_hioffset = ((uintptr_t)func)>>16 ;
491 }
492 
493 extern inthand_t
494 	IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
495 	IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
496 	IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
497 	IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
498 	IDTVEC(xmm), IDTVEC(dblfault),
499 	IDTVEC(div_pti), IDTVEC(bpt_pti),
500 	IDTVEC(ofl_pti), IDTVEC(bnd_pti), IDTVEC(ill_pti), IDTVEC(dna_pti),
501 	IDTVEC(fpusegm_pti), IDTVEC(tss_pti), IDTVEC(missing_pti),
502 	IDTVEC(stk_pti), IDTVEC(prot_pti), IDTVEC(page_pti),
503 	IDTVEC(rsvd_pti), IDTVEC(fpu_pti), IDTVEC(align_pti),
504 	IDTVEC(xmm_pti),
505 #ifdef KDTRACE_HOOKS
506 	IDTVEC(dtrace_ret), IDTVEC(dtrace_ret_pti),
507 #endif
508 #ifdef XENHVM
509 	IDTVEC(xen_intr_upcall), IDTVEC(xen_intr_upcall_pti),
510 #endif
511 	IDTVEC(fast_syscall), IDTVEC(fast_syscall32),
512 	IDTVEC(fast_syscall_pti);
513 
514 #ifdef DDB
515 /*
516  * Display the index and function name of any IDT entries that don't use
517  * the default 'rsvd' entry point.
518  */
519 DB_SHOW_COMMAND_FLAGS(idt, db_show_idt, DB_CMD_MEMSAFE)
520 {
521 	struct gate_descriptor *ip;
522 	int idx;
523 	uintptr_t func;
524 
525 	ip = idt;
526 	for (idx = 0; idx < NIDT && !db_pager_quit; idx++) {
527 		func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset);
528 		if (func != (uintptr_t)&IDTVEC(rsvd)) {
529 			db_printf("%3d\t", idx);
530 			db_printsym(func, DB_STGY_PROC);
531 			db_printf("\n");
532 		}
533 		ip++;
534 	}
535 }
536 
537 /* Show privileged registers. */
538 DB_SHOW_COMMAND_FLAGS(sysregs, db_show_sysregs, DB_CMD_MEMSAFE)
539 {
540 	struct {
541 		uint16_t limit;
542 		uint64_t base;
543 	} __packed idtr, gdtr;
544 	uint16_t ldt, tr;
545 
546 	__asm __volatile("sidt %0" : "=m" (idtr));
547 	db_printf("idtr\t0x%016lx/%04x\n",
548 	    (u_long)idtr.base, (u_int)idtr.limit);
549 	__asm __volatile("sgdt %0" : "=m" (gdtr));
550 	db_printf("gdtr\t0x%016lx/%04x\n",
551 	    (u_long)gdtr.base, (u_int)gdtr.limit);
552 	__asm __volatile("sldt %0" : "=r" (ldt));
553 	db_printf("ldtr\t0x%04x\n", ldt);
554 	__asm __volatile("str %0" : "=r" (tr));
555 	db_printf("tr\t0x%04x\n", tr);
556 	db_printf("cr0\t0x%016lx\n", rcr0());
557 	db_printf("cr2\t0x%016lx\n", rcr2());
558 	db_printf("cr3\t0x%016lx\n", rcr3());
559 	db_printf("cr4\t0x%016lx\n", rcr4());
560 	if (rcr4() & CR4_XSAVE)
561 		db_printf("xcr0\t0x%016lx\n", rxcr(0));
562 	db_printf("EFER\t0x%016lx\n", rdmsr(MSR_EFER));
563 	if (cpu_feature2 & (CPUID2_VMX | CPUID2_SMX))
564 		db_printf("FEATURES_CTL\t%016lx\n",
565 		    rdmsr(MSR_IA32_FEATURE_CONTROL));
566 	db_printf("DEBUG_CTL\t0x%016lx\n", rdmsr(MSR_DEBUGCTLMSR));
567 	db_printf("PAT\t0x%016lx\n", rdmsr(MSR_PAT));
568 	db_printf("GSBASE\t0x%016lx\n", rdmsr(MSR_GSBASE));
569 }
570 
571 DB_SHOW_COMMAND_FLAGS(dbregs, db_show_dbregs, DB_CMD_MEMSAFE)
572 {
573 
574 	db_printf("dr0\t0x%016lx\n", rdr0());
575 	db_printf("dr1\t0x%016lx\n", rdr1());
576 	db_printf("dr2\t0x%016lx\n", rdr2());
577 	db_printf("dr3\t0x%016lx\n", rdr3());
578 	db_printf("dr6\t0x%016lx\n", rdr6());
579 	db_printf("dr7\t0x%016lx\n", rdr7());
580 }
581 #endif
582 
583 void
584 sdtossd(struct user_segment_descriptor *sd, struct soft_segment_descriptor *ssd)
585 {
586 
587 	ssd->ssd_base  = (sd->sd_hibase << 24) | sd->sd_lobase;
588 	ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
589 	ssd->ssd_type  = sd->sd_type;
590 	ssd->ssd_dpl   = sd->sd_dpl;
591 	ssd->ssd_p     = sd->sd_p;
592 	ssd->ssd_long  = sd->sd_long;
593 	ssd->ssd_def32 = sd->sd_def32;
594 	ssd->ssd_gran  = sd->sd_gran;
595 }
596 
597 void
598 ssdtosd(struct soft_segment_descriptor *ssd, struct user_segment_descriptor *sd)
599 {
600 
601 	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
602 	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff;
603 	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
604 	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
605 	sd->sd_type  = ssd->ssd_type;
606 	sd->sd_dpl   = ssd->ssd_dpl;
607 	sd->sd_p     = ssd->ssd_p;
608 	sd->sd_long  = ssd->ssd_long;
609 	sd->sd_def32 = ssd->ssd_def32;
610 	sd->sd_gran  = ssd->ssd_gran;
611 }
612 
613 void
614 ssdtosyssd(struct soft_segment_descriptor *ssd, struct system_segment_descriptor *sd)
615 {
616 
617 	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
618 	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful;
619 	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
620 	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
621 	sd->sd_type  = ssd->ssd_type;
622 	sd->sd_dpl   = ssd->ssd_dpl;
623 	sd->sd_p     = ssd->ssd_p;
624 	sd->sd_gran  = ssd->ssd_gran;
625 }
626 
627 u_int basemem;
628 
629 static int
630 add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap,
631     int *physmap_idxp)
632 {
633 	int i, insert_idx, physmap_idx;
634 
635 	physmap_idx = *physmap_idxp;
636 
637 	if (length == 0)
638 		return (1);
639 
640 	/*
641 	 * Find insertion point while checking for overlap.  Start off by
642 	 * assuming the new entry will be added to the end.
643 	 *
644 	 * NB: physmap_idx points to the next free slot.
645 	 */
646 	insert_idx = physmap_idx;
647 	for (i = 0; i <= physmap_idx; i += 2) {
648 		if (base < physmap[i + 1]) {
649 			if (base + length <= physmap[i]) {
650 				insert_idx = i;
651 				break;
652 			}
653 			if (boothowto & RB_VERBOSE)
654 				printf(
655 		    "Overlapping memory regions, ignoring second region\n");
656 			return (1);
657 		}
658 	}
659 
660 	/* See if we can prepend to the next entry. */
661 	if (insert_idx <= physmap_idx && base + length == physmap[insert_idx]) {
662 		physmap[insert_idx] = base;
663 		return (1);
664 	}
665 
666 	/* See if we can append to the previous entry. */
667 	if (insert_idx > 0 && base == physmap[insert_idx - 1]) {
668 		physmap[insert_idx - 1] += length;
669 		return (1);
670 	}
671 
672 	physmap_idx += 2;
673 	*physmap_idxp = physmap_idx;
674 	if (physmap_idx == PHYS_AVAIL_ENTRIES) {
675 		printf(
676 		"Too many segments in the physical address map, giving up\n");
677 		return (0);
678 	}
679 
680 	/*
681 	 * Move the last 'N' entries down to make room for the new
682 	 * entry if needed.
683 	 */
684 	for (i = (physmap_idx - 2); i > insert_idx; i -= 2) {
685 		physmap[i] = physmap[i - 2];
686 		physmap[i + 1] = physmap[i - 1];
687 	}
688 
689 	/* Insert the new entry. */
690 	physmap[insert_idx] = base;
691 	physmap[insert_idx + 1] = base + length;
692 	return (1);
693 }
694 
695 void
696 bios_add_smap_entries(struct bios_smap *smapbase, u_int32_t smapsize,
697                       vm_paddr_t *physmap, int *physmap_idx)
698 {
699 	struct bios_smap *smap, *smapend;
700 
701 	smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);
702 
703 	for (smap = smapbase; smap < smapend; smap++) {
704 		if (boothowto & RB_VERBOSE)
705 			printf("SMAP type=%02x base=%016lx len=%016lx\n",
706 			    smap->type, smap->base, smap->length);
707 
708 		if (smap->type != SMAP_TYPE_MEMORY)
709 			continue;
710 
711 		if (!add_physmap_entry(smap->base, smap->length, physmap,
712 		    physmap_idx))
713 			break;
714 	}
715 }
716 
717 static void
718 add_efi_map_entries(struct efi_map_header *efihdr, vm_paddr_t *physmap,
719     int *physmap_idx)
720 {
721 	struct efi_md *map, *p;
722 	const char *type;
723 	size_t efisz;
724 	int ndesc, i;
725 
726 	static const char *types[] = {
727 		"Reserved",
728 		"LoaderCode",
729 		"LoaderData",
730 		"BootServicesCode",
731 		"BootServicesData",
732 		"RuntimeServicesCode",
733 		"RuntimeServicesData",
734 		"ConventionalMemory",
735 		"UnusableMemory",
736 		"ACPIReclaimMemory",
737 		"ACPIMemoryNVS",
738 		"MemoryMappedIO",
739 		"MemoryMappedIOPortSpace",
740 		"PalCode",
741 		"PersistentMemory"
742 	};
743 
744 	/*
745 	 * Memory map data provided by UEFI via the GetMemoryMap
746 	 * Boot Services API.
747 	 */
748 	efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf;
749 	map = (struct efi_md *)((uint8_t *)efihdr + efisz);
750 
751 	if (efihdr->descriptor_size == 0)
752 		return;
753 	ndesc = efihdr->memory_size / efihdr->descriptor_size;
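	/*
	 * Note that descriptor_size may be larger than sizeof(struct efi_md),
	 * which is why the loop below advances with efi_next_descriptor()
	 * instead of plain array indexing.
	 */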
754 
755 	if (boothowto & RB_VERBOSE)
756 		printf("%23s %12s %12s %8s %4s\n",
757 		    "Type", "Physical", "Virtual", "#Pages", "Attr");
758 
759 	for (i = 0, p = map; i < ndesc; i++,
760 	    p = efi_next_descriptor(p, efihdr->descriptor_size)) {
761 		if (boothowto & RB_VERBOSE) {
762 			if (p->md_type < nitems(types))
763 				type = types[p->md_type];
764 			else
765 				type = "<INVALID>";
766 			printf("%23s %012lx %012lx %08lx ", type, p->md_phys,
767 			    p->md_virt, p->md_pages);
768 			if (p->md_attr & EFI_MD_ATTR_UC)
769 				printf("UC ");
770 			if (p->md_attr & EFI_MD_ATTR_WC)
771 				printf("WC ");
772 			if (p->md_attr & EFI_MD_ATTR_WT)
773 				printf("WT ");
774 			if (p->md_attr & EFI_MD_ATTR_WB)
775 				printf("WB ");
776 			if (p->md_attr & EFI_MD_ATTR_UCE)
777 				printf("UCE ");
778 			if (p->md_attr & EFI_MD_ATTR_WP)
779 				printf("WP ");
780 			if (p->md_attr & EFI_MD_ATTR_RP)
781 				printf("RP ");
782 			if (p->md_attr & EFI_MD_ATTR_XP)
783 				printf("XP ");
784 			if (p->md_attr & EFI_MD_ATTR_NV)
785 				printf("NV ");
786 			if (p->md_attr & EFI_MD_ATTR_MORE_RELIABLE)
787 				printf("MORE_RELIABLE ");
788 			if (p->md_attr & EFI_MD_ATTR_RO)
789 				printf("RO ");
790 			if (p->md_attr & EFI_MD_ATTR_RT)
791 				printf("RUNTIME");
792 			printf("\n");
793 		}
794 
795 		switch (p->md_type) {
796 		case EFI_MD_TYPE_CODE:
797 		case EFI_MD_TYPE_DATA:
798 		case EFI_MD_TYPE_BS_CODE:
799 		case EFI_MD_TYPE_BS_DATA:
800 		case EFI_MD_TYPE_FREE:
801 			/*
802 			 * We're allowed to use any entry with these types.
803 			 */
804 			break;
805 		default:
806 			continue;
807 		}
808 
809 		if (!add_physmap_entry(p->md_phys, p->md_pages * EFI_PAGE_SIZE,
810 		    physmap, physmap_idx))
811 			break;
812 	}
813 }
814 
815 static void
816 native_parse_memmap(caddr_t kmdp, vm_paddr_t *physmap, int *physmap_idx)
817 {
818 	struct bios_smap *smap;
819 	struct efi_map_header *efihdr;
820 	u_int32_t size;
821 
822 	/*
823 	 * Memory map from INT 15:E820.
824 	 *
825 	 * subr_module.c says:
826 	 * "Consumer may safely assume that size value precedes data."
827 	 * ie: an int32_t immediately precedes smap.
828 	 */
829 
830 	efihdr = (struct efi_map_header *)preload_search_info(kmdp,
831 	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
832 	smap = (struct bios_smap *)preload_search_info(kmdp,
833 	    MODINFO_METADATA | MODINFOMD_SMAP);
834 	if (efihdr == NULL && smap == NULL)
835 		panic("No BIOS smap or EFI map info from loader!");
836 
837 	if (efihdr != NULL) {
838 		add_efi_map_entries(efihdr, physmap, physmap_idx);
839 		strlcpy(bootmethod, "UEFI", sizeof(bootmethod));
840 	} else {
841 		size = *((u_int32_t *)smap - 1);
842 		bios_add_smap_entries(smap, size, physmap, physmap_idx);
843 		strlcpy(bootmethod, "BIOS", sizeof(bootmethod));
844 	}
845 }
846 
847 #define	PAGES_PER_GB	(1024 * 1024 * 1024 / PAGE_SIZE)
848 
849 /*
850  * Populate the (physmap) array with base/bound pairs describing the
851  * available physical memory in the system, then test this memory and
852  * build the phys_avail array describing the actually-available memory.
853  *
854  * Total memory size may be set by the kernel environment variable
855  * hw.physmem or the compile-time define MAXMEM.
856  *
857  * XXX first should be vm_paddr_t.
858  */
859 static void
860 getmemsize(caddr_t kmdp, u_int64_t first)
861 {
862 	int i, physmap_idx, pa_indx, da_indx;
863 	vm_paddr_t pa, physmap[PHYS_AVAIL_ENTRIES];
864 	u_long physmem_start, physmem_tunable, memtest;
865 	pt_entry_t *pte;
866 	quad_t dcons_addr, dcons_size;
867 	int page_counter;
868 
869 	TSENTER();
870 	/*
871 	 * Tell the physical memory allocator about pages used to store
872 	 * the kernel and preloaded data.  See kmem_bootstrap_free().
873 	 */
874 	vm_phys_early_add_seg((vm_paddr_t)kernphys, trunc_page(first));
875 
876 	bzero(physmap, sizeof(physmap));
877 	physmap_idx = 0;
878 
879 	init_ops.parse_memmap(kmdp, physmap, &physmap_idx);
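	/*
	 * parse_memmap() leaves physmap_idx pointing at the next free slot;
	 * step back one pair so it indexes the last base/end entry.
	 */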
880 	physmap_idx -= 2;
881 
882 	/*
883 	 * Find the 'base memory' segment for SMP
884 	 */
885 	basemem = 0;
886 	for (i = 0; i <= physmap_idx; i += 2) {
887 		if (physmap[i] <= 0xA0000) {
888 			basemem = physmap[i + 1] / 1024;
889 			break;
890 		}
891 	}
892 	if (basemem == 0 || basemem > 640) {
893 		if (bootverbose)
894 			printf(
895 		"Memory map doesn't contain a basemem segment, faking it");
896 		basemem = 640;
897 	}
898 
899 	/*
900 	 * Maxmem isn't the "maximum memory", it's one larger than the
901 	 * highest page of the physical address space.  It should be
902 	 * called something like "Maxphyspage".  We may adjust this
903 	 * based on ``hw.physmem'' and the results of the memory test.
904 	 */
905 	Maxmem = atop(physmap[physmap_idx + 1]);
906 
907 #ifdef MAXMEM
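	/* MAXMEM is specified in kilobytes; convert it to 4KB pages. */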
908 	Maxmem = MAXMEM / 4;
909 #endif
910 
911 	if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
912 		Maxmem = atop(physmem_tunable);
913 
914 	/*
915 	 * The boot memory test is disabled by default, as it takes a
916 	 * significant amount of time on large-memory systems, and is
917 	 * unfriendly to virtual machines as it unnecessarily touches all
918 	 * pages.
919 	 *
920 	 * A general name is used as the code may be extended to support
921 	 * additional tests beyond the current "page present" test.
922 	 */
923 	memtest = 0;
924 	TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest);
925 
926 	/*
927 	 * Don't allow MAXMEM or hw.physmem to extend the amount of memory
928 	 * in the system.
929 	 */
930 	if (Maxmem > atop(physmap[physmap_idx + 1]))
931 		Maxmem = atop(physmap[physmap_idx + 1]);
932 
933 	if (atop(physmap[physmap_idx + 1]) != Maxmem &&
934 	    (boothowto & RB_VERBOSE))
935 		printf("Physical memory use set to %ldK\n", Maxmem * 4);
936 
937 	/* call pmap initialization to make new kernel address space */
938 	pmap_bootstrap(&first);
939 
940 	/*
941 	 * Size up each available chunk of physical memory.
942 	 *
943 	 * XXX Some BIOSes corrupt low 64KB between suspend and resume.
944 	 * By default, mask off the first 16 pages unless we appear to be
945 	 * running in a VM.
946 	 */
947 	physmem_start = (vm_guest > VM_GUEST_NO ? 1 : 16) << PAGE_SHIFT;
948 	TUNABLE_ULONG_FETCH("hw.physmem.start", &physmem_start);
949 	if (physmap[0] < physmem_start) {
950 		if (physmem_start < PAGE_SIZE)
951 			physmap[0] = PAGE_SIZE;
952 		else if (physmem_start >= physmap[1])
953 			physmap[0] = round_page(physmap[1] - PAGE_SIZE);
954 		else
955 			physmap[0] = round_page(physmem_start);
956 	}
957 	pa_indx = 0;
958 	da_indx = 1;
959 	phys_avail[pa_indx++] = physmap[0];
960 	phys_avail[pa_indx] = physmap[0];
961 	dump_avail[da_indx] = physmap[0];
962 	pte = CMAP1;
963 
964 	/*
965 	 * Get dcons buffer address
966 	 */
967 	if (getenv_quad("dcons.addr", &dcons_addr) == 0 ||
968 	    getenv_quad("dcons.size", &dcons_size) == 0)
969 		dcons_addr = 0;
970 
971 	/*
972 	 * physmap is in bytes, so when converting to page boundaries,
973 	 * round up the start address and round down the end address.
974 	 */
975 	page_counter = 0;
976 	if (memtest != 0)
977 		printf("Testing system memory");
978 	for (i = 0; i <= physmap_idx; i += 2) {
979 		vm_paddr_t end;
980 
981 		end = ptoa((vm_paddr_t)Maxmem);
982 		if (physmap[i + 1] < end)
983 			end = trunc_page(physmap[i + 1]);
984 		for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
985 			int *ptr = (int *)CADDR1;
986 			int tmp;
987 			bool full, page_bad;
988 
989 			full = false;
990 			/*
991 			 * block out kernel memory as not available.
992 			 */
993 			if (pa >= (vm_paddr_t)kernphys && pa < first)
994 				goto do_dump_avail;
995 
996 			/*
997 			 * block out dcons buffer
998 			 */
999 			if (dcons_addr > 0
1000 			    && pa >= trunc_page(dcons_addr)
1001 			    && pa < dcons_addr + dcons_size)
1002 				goto do_dump_avail;
1003 
1004 			page_bad = false;
1005 			if (memtest == 0)
1006 				goto skip_memtest;
1007 
1008 			/*
1009 			 * Print a "." every GB to show we're making
1010 			 * progress.
1011 			 */
1012 			page_counter++;
1013 			if ((page_counter % PAGES_PER_GB) == 0)
1014 				printf(".");
1015 
1016 			/*
1017 			 * map page into kernel: valid, read/write,non-cacheable
1018 			 */
1019 			*pte = pa | PG_V | PG_RW | PG_NC_PWT | PG_NC_PCD;
1020 			invltlb();
1021 
1022 			tmp = *(int *)ptr;
1023 			/*
1024 			 * Test for alternating 1's and 0's
1025 			 */
1026 			*(volatile int *)ptr = 0xaaaaaaaa;
1027 			if (*(volatile int *)ptr != 0xaaaaaaaa)
1028 				page_bad = true;
1029 			/*
1030 			 * Test for alternating 0's and 1's
1031 			 */
1032 			*(volatile int *)ptr = 0x55555555;
1033 			if (*(volatile int *)ptr != 0x55555555)
1034 				page_bad = true;
1035 			/*
1036 			 * Test for all 1's
1037 			 */
1038 			*(volatile int *)ptr = 0xffffffff;
1039 			if (*(volatile int *)ptr != 0xffffffff)
1040 				page_bad = true;
1041 			/*
1042 			 * Test for all 0's
1043 			 */
1044 			*(volatile int *)ptr = 0x0;
1045 			if (*(volatile int *)ptr != 0x0)
1046 				page_bad = true;
1047 			/*
1048 			 * Restore original value.
1049 			 */
1050 			*(int *)ptr = tmp;
1051 
1052 skip_memtest:
1053 			/*
1054 			 * Adjust array of valid/good pages.
1055 			 */
1056 			if (page_bad == true)
1057 				continue;
1058 			/*
1059 			 * If this good page is a continuation of the
1060 			 * previous set of good pages, then just increase
1061 			 * the end pointer. Otherwise start a new chunk.
1062 			 * Note that "end" points one higher than end,
1063 			 * making the range >= start and < end.
1064 			 * If we're also doing a speculative memory
1065 			 * test and we're at or past the end, bump up Maxmem
1066 			 * so that we keep going. The first bad page
1067 			 * will terminate the loop.
1068 			 */
1069 			if (phys_avail[pa_indx] == pa) {
1070 				phys_avail[pa_indx] += PAGE_SIZE;
1071 			} else {
1072 				pa_indx++;
1073 				if (pa_indx == PHYS_AVAIL_ENTRIES) {
1074 					printf(
1075 		"Too many holes in the physical address space, giving up\n");
1076 					pa_indx--;
1077 					full = true;
1078 					goto do_dump_avail;
1079 				}
1080 				phys_avail[pa_indx++] = pa;	/* start */
1081 				phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */
1082 			}
1083 			physmem++;
1084 do_dump_avail:
1085 			if (dump_avail[da_indx] == pa) {
1086 				dump_avail[da_indx] += PAGE_SIZE;
1087 			} else {
1088 				da_indx++;
1089 				if (da_indx == PHYS_AVAIL_ENTRIES) {
1090 					da_indx--;
1091 					goto do_next;
1092 				}
1093 				dump_avail[da_indx++] = pa; /* start */
1094 				dump_avail[da_indx] = pa + PAGE_SIZE; /* end */
1095 			}
1096 do_next:
1097 			if (full)
1098 				break;
1099 		}
1100 	}
1101 	*pte = 0;
1102 	invltlb();
1103 	if (memtest != 0)
1104 		printf("\n");
1105 
1106 	/*
1107 	 * XXX
1108 	 * The last chunk must contain at least one page plus the message
1109 	 * buffer to avoid complicating other code (message buffer address
1110 	 * calculation, etc.).
1111 	 */
1112 	while (phys_avail[pa_indx - 1] + PAGE_SIZE +
1113 	    round_page(msgbufsize) >= phys_avail[pa_indx]) {
1114 		physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
1115 		phys_avail[pa_indx--] = 0;
1116 		phys_avail[pa_indx--] = 0;
1117 	}
1118 
1119 	Maxmem = atop(phys_avail[pa_indx]);
1120 
1121 	/* Trim off space for the message buffer. */
1122 	phys_avail[pa_indx] -= round_page(msgbufsize);
1123 
1124 	/* Map the message buffer. */
1125 	msgbufp = (struct msgbuf *)PHYS_TO_DMAP(phys_avail[pa_indx]);
1126 	TSEXIT();
1127 }
1128 
1129 static caddr_t
1130 native_parse_preload_data(u_int64_t modulep)
1131 {
1132 	caddr_t kmdp;
1133 	char *envp;
1134 #ifdef DDB
1135 	vm_offset_t ksym_start;
1136 	vm_offset_t ksym_end;
1137 #endif
1138 
1139 	preload_metadata = (caddr_t)(uintptr_t)(modulep + KERNBASE);
1140 	preload_bootstrap_relocate(KERNBASE);
1141 	kmdp = preload_search_by_type("elf kernel");
1142 	if (kmdp == NULL)
1143 		kmdp = preload_search_by_type("elf64 kernel");
1144 	boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int);
1145 	envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *);
1146 	if (envp != NULL)
1147 		envp += KERNBASE;
1148 	init_static_kenv(envp, 0);
1149 #ifdef DDB
1150 	ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t);
1151 	ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t);
1152 	db_fetch_ksymtab(ksym_start, ksym_end, 0);
1153 #endif
1154 	efi_systbl_phys = MD_FETCH(kmdp, MODINFOMD_FW_HANDLE, vm_paddr_t);
1155 
1156 	return (kmdp);
1157 }
1158 
1159 static void
1160 native_clock_source_init(void)
1161 {
1162 	i8254_init();
1163 }
1164 
1165 static void
1166 amd64_kdb_init(void)
1167 {
1168 	kdb_init();
1169 #ifdef KDB
1170 	if (boothowto & RB_KDB)
1171 		kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger");
1172 #endif
1173 }
1174 
1175 /* Set up the fast syscall stuff */
1176 void
1177 amd64_conf_fast_syscall(void)
1178 {
1179 	uint64_t msr;
1180 
1181 	msr = rdmsr(MSR_EFER) | EFER_SCE;
1182 	wrmsr(MSR_EFER, msr);
1183 	wrmsr(MSR_LSTAR, pti ? (u_int64_t)IDTVEC(fast_syscall_pti) :
1184 	    (u_int64_t)IDTVEC(fast_syscall));
1185 	wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
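	/*
	 * MSR_STAR supplies the segment selector bases: bits 47:32 seed
	 * %cs/%ss for SYSCALL entry (kernel), bits 63:48 seed them for
	 * SYSRET (user).  MSR_SF_MASK lists the RFLAGS bits cleared on
	 * SYSCALL entry.
	 */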
1186 	msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
1187 	    ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
1188 	wrmsr(MSR_STAR, msr);
1189 	wrmsr(MSR_SF_MASK, PSL_NT | PSL_T | PSL_I | PSL_C | PSL_D | PSL_AC);
1190 }
1191 
1192 void
1193 amd64_bsp_pcpu_init1(struct pcpu *pc)
1194 {
1195 	struct user_segment_descriptor *gdt;
1196 
1197 	PCPU_SET(prvspace, pc);
1198 	gdt = *PCPU_PTR(gdt);
1199 	PCPU_SET(curthread, &thread0);
1200 	PCPU_SET(tssp, PCPU_PTR(common_tss));
1201 	PCPU_SET(tss, (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
1202 	PCPU_SET(ldt, (struct system_segment_descriptor *)&gdt[GUSERLDT_SEL]);
1203 	PCPU_SET(fs32p, &gdt[GUFS32_SEL]);
1204 	PCPU_SET(gs32p, &gdt[GUGS32_SEL]);
1205 	PCPU_SET(ucr3_load_mask, PMAP_UCR3_NOMASK);
1206 	PCPU_SET(smp_tlb_gen, 1);
1207 }
1208 
1209 void
1210 amd64_bsp_pcpu_init2(uint64_t rsp0)
1211 {
1212 
1213 	PCPU_SET(rsp0, rsp0);
1214 	PCPU_SET(pti_rsp0, ((vm_offset_t)PCPU_PTR(pti_stack) +
1215 	    PC_PTI_STACK_SZ * sizeof(uint64_t)) & ~0xful);
1216 	PCPU_SET(curpcb, thread0.td_pcb);
1217 }
1218 
1219 void
1220 amd64_bsp_ist_init(struct pcpu *pc)
1221 {
1222 	struct nmi_pcpu *np;
1223 	struct amd64tss *tssp;
1224 
1225 	tssp = &pc->pc_common_tss;
1226 
1227 	/* doublefault stack space, runs on ist1 */
1228 	np = ((struct nmi_pcpu *)&dblfault_stack[sizeof(dblfault_stack)]) - 1;
1229 	np->np_pcpu = (register_t)pc;
1230 	tssp->tss_ist1 = (long)np;
1231 
1232 	/*
1233 	 * NMI stack, runs on ist2.  The pcpu pointer is stored just
1234 	 * above the start of the ist2 stack.
1235 	 */
1236 	np = ((struct nmi_pcpu *)&nmi0_stack[sizeof(nmi0_stack)]) - 1;
1237 	np->np_pcpu = (register_t)pc;
1238 	tssp->tss_ist2 = (long)np;
1239 
1240 	/*
1241 	 * MC# stack, runs on ist3.  The pcpu pointer is stored just
1242 	 * above the start of the ist3 stack.
1243 	 */
1244 	np = ((struct nmi_pcpu *)&mce0_stack[sizeof(mce0_stack)]) - 1;
1245 	np->np_pcpu = (register_t)pc;
1246 	tssp->tss_ist3 = (long)np;
1247 
1248 	/*
1249 	 * DB# stack, runs on ist4.
1250 	 */
1251 	np = ((struct nmi_pcpu *)&dbg0_stack[sizeof(dbg0_stack)]) - 1;
1252 	np->np_pcpu = (register_t)pc;
1253 	tssp->tss_ist4 = (long)np;
1254 }
1255 
1256 /*
1257  * Calculate the kernel load address by inspecting page table created by loader.
1258  * The assumptions:
1259  * - kernel is mapped at KERNBASE, backed by contiguous phys memory
1260  *   aligned at 2M, below 4G (the latter is important for AP startup)
1261  * - there is a 2M hole at KERNBASE (KERNSTART = KERNBASE + 2M)
1262  * - kernel is mapped with 2M superpages
1263  * - all participating memory, i.e. kernel, modules, metadata,
1264  *   page table is accessible by pre-created 1:1 mapping
1265  *   (right now loader creates 1:1 mapping for lower 4G, and all
1266  *   memory is from there)
1267  * - there is a usable memory block right after the end of the
1268  *   mapped kernel and all modules/metadata, pointed to by
1269  *   physfree, for early allocations
1270  */
1271 vm_paddr_t __nosanitizeaddress __nosanitizememory
1272 amd64_loadaddr(void)
1273 {
1274 	pml4_entry_t *pml4e;
1275 	pdp_entry_t *pdpe;
1276 	pd_entry_t *pde;
1277 	uint64_t cr3;
1278 
1279 	cr3 = rcr3();
1280 	pml4e = (pml4_entry_t *)cr3 + pmap_pml4e_index(KERNSTART);
1281 	pdpe = (pdp_entry_t *)(*pml4e & PG_FRAME) + pmap_pdpe_index(KERNSTART);
1282 	pde = (pd_entry_t *)(*pdpe & PG_FRAME) + pmap_pde_index(KERNSTART);
1283 	return (*pde & PG_FRAME);
1284 }
1285 
1286 u_int64_t
1287 hammer_time(u_int64_t modulep, u_int64_t physfree)
1288 {
1289 	caddr_t kmdp;
1290 	int gsel_tss, x;
1291 	struct pcpu *pc;
1292 	uint64_t rsp0;
1293 	char *env;
1294 	struct user_segment_descriptor *gdt;
1295 	struct region_descriptor r_gdt;
1296 	size_t kstack0_sz;
1297 
1298 	TSRAW(&thread0, TS_ENTER, __func__, NULL);
1299 
1300 	kernphys = amd64_loadaddr();
1301 
1302 	physfree += kernphys;
1303 
1304 	kmdp = init_ops.parse_preload_data(modulep);
1305 
1306 	efi_boot = preload_search_info(kmdp, MODINFO_METADATA |
1307 	    MODINFOMD_EFI_MAP) != NULL;
1308 
1309 	if (!efi_boot) {
1310 		/* Tell the bios to warmboot next time */
1311 		atomic_store_short((u_short *)0x472, 0x1234);
1312 	}
1313 
1314 	physfree += ucode_load_bsp(physfree - kernphys + KERNSTART);
1315 	physfree = roundup2(physfree, PAGE_SIZE);
1316 
1317 	identify_cpu1();
1318 	identify_hypervisor();
1319 	identify_hypervisor_smbios();
1320 	identify_cpu_fixup_bsp();
1321 	identify_cpu2();
1322 	initializecpucache();
1323 
1324 	/*
1325 	 * Check for pti, pcid, and invpcid before ifuncs are
1326 	 * resolved, to correctly select the implementation for
1327 	 * pmap_activate_sw_mode().
1328 	 */
1329 	pti = pti_get_default();
1330 	TUNABLE_INT_FETCH("vm.pmap.pti", &pti);
1331 	TUNABLE_INT_FETCH("vm.pmap.pcid_enabled", &pmap_pcid_enabled);
1332 	if ((cpu_feature2 & CPUID2_PCID) != 0 && pmap_pcid_enabled) {
1333 		invpcid_works = (cpu_stdext_feature &
1334 		    CPUID_STDEXT_INVPCID) != 0;
1335 	} else {
1336 		pmap_pcid_enabled = 0;
1337 	}
1338 
1339 	/*
1340 	 * Now we can do small core initialization, after the PCID
1341 	 * CPU features and user knobs are evaluated.
1342 	 */
1343 	TUNABLE_INT_FETCH("vm.pmap.pcid_invlpg_workaround",
1344 	    &pmap_pcid_invlpg_workaround_uena);
1345 	cpu_init_small_core();
1346 
1347 	if ((cpu_feature2 & CPUID2_XSAVE) != 0) {
1348 		use_xsave = 1;
1349 		TUNABLE_INT_FETCH("hw.use_xsave", &use_xsave);
1350 	}
1351 
1352 	link_elf_ireloc(kmdp);
1353 
1354 	/*
1355 	 * This may be done better later if it gets more high level
1356 	 * components in it. If so just link td->td_proc here.
1357 	 */
1358 	proc_linkup0(&proc0, &thread0);
1359 
1360 	/* Init basic tunables, hz etc */
1361 	init_param1();
1362 
1363 	thread0.td_kstack = physfree - kernphys + KERNSTART;
1364 	thread0.td_kstack_pages = kstack_pages;
1365 	kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE;
1366 	bzero((void *)thread0.td_kstack, kstack0_sz);
1367 	physfree += kstack0_sz;
1368 
1369 	/*
1370 	 * Initialize enough of thread0 for delayed invalidation to
1371 	 * work very early.  Rely on thread0.td_base_pri
1372 	 * zero-initialization, it is reset to PVM at proc0_init().
1373 	 */
1374 	pmap_thread_init_invl_gen(&thread0);
1375 
1376 	pc = &temp_bsp_pcpu;
1377 	pcpu_init(pc, 0, sizeof(struct pcpu));
1378 	gdt = &temp_bsp_pcpu.pc_gdt[0];
1379 
1380 	/*
1381 	 * make gdt memory segments
1382 	 */
1383 	for (x = 0; x < NGDT; x++) {
1384 		if (x != GPROC0_SEL && x != (GPROC0_SEL + 1) &&
1385 		    x != GUSERLDT_SEL && x != (GUSERLDT_SEL + 1))
1386 			ssdtosd(&gdt_segs[x], &gdt[x]);
1387 	}
1388 	gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&pc->pc_common_tss;
1389 	ssdtosyssd(&gdt_segs[GPROC0_SEL],
1390 	    (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
1391 
1392 	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
1393 	r_gdt.rd_base = (long)gdt;
1394 	lgdt(&r_gdt);
1395 
1396 	wrmsr(MSR_FSBASE, 0);		/* User value */
1397 	wrmsr(MSR_GSBASE, (u_int64_t)pc);
1398 	wrmsr(MSR_KGSBASE, 0);		/* User value while in the kernel */
1399 
1400 	dpcpu_init((void *)(physfree - kernphys + KERNSTART), 0);
1401 	physfree += DPCPU_SIZE;
1402 	amd64_bsp_pcpu_init1(pc);
1403 	/* Non-late cninit() and printf() can be moved up to here. */
1404 
1405 	/*
1406 	 * Initialize mutexes.
1407 	 *
1408 	 * icu_lock: in order to allow an interrupt to occur in a critical
1409 	 * 	     section, to set pcpu->ipending (etc...) properly, we
1410 	 *	     must be able to get the icu lock, so it can't be
1411 	 *	     under witness.
1412 	 */
1413 	mutex_init();
1414 	mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS);
1415 	mtx_init(&dt_lock, "descriptor tables", NULL, MTX_DEF);
1416 
1417 	/* exceptions */
1418 	for (x = 0; x < NIDT; x++)
1419 		setidt(x, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_SYSIGT,
1420 		    SEL_KPL, 0);
1421 	setidt(IDT_DE, pti ? &IDTVEC(div_pti) : &IDTVEC(div), SDT_SYSIGT,
1422 	    SEL_KPL, 0);
1423 	setidt(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 4);
1424 	setidt(IDT_NMI, &IDTVEC(nmi),  SDT_SYSIGT, SEL_KPL, 2);
1425 	setidt(IDT_BP, pti ? &IDTVEC(bpt_pti) : &IDTVEC(bpt), SDT_SYSIGT,
1426 	    SEL_UPL, 0);
1427 	setidt(IDT_OF, pti ? &IDTVEC(ofl_pti) : &IDTVEC(ofl), SDT_SYSIGT,
1428 	    SEL_UPL, 0);
1429 	setidt(IDT_BR, pti ? &IDTVEC(bnd_pti) : &IDTVEC(bnd), SDT_SYSIGT,
1430 	    SEL_KPL, 0);
1431 	setidt(IDT_UD, pti ? &IDTVEC(ill_pti) : &IDTVEC(ill), SDT_SYSIGT,
1432 	    SEL_KPL, 0);
1433 	setidt(IDT_NM, pti ? &IDTVEC(dna_pti) : &IDTVEC(dna), SDT_SYSIGT,
1434 	    SEL_KPL, 0);
1435 	setidt(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
1436 	setidt(IDT_FPUGP, pti ? &IDTVEC(fpusegm_pti) : &IDTVEC(fpusegm),
1437 	    SDT_SYSIGT, SEL_KPL, 0);
1438 	setidt(IDT_TS, pti ? &IDTVEC(tss_pti) : &IDTVEC(tss), SDT_SYSIGT,
1439 	    SEL_KPL, 0);
1440 	setidt(IDT_NP, pti ? &IDTVEC(missing_pti) : &IDTVEC(missing),
1441 	    SDT_SYSIGT, SEL_KPL, 0);
1442 	setidt(IDT_SS, pti ? &IDTVEC(stk_pti) : &IDTVEC(stk), SDT_SYSIGT,
1443 	    SEL_KPL, 0);
1444 	setidt(IDT_GP, pti ? &IDTVEC(prot_pti) : &IDTVEC(prot), SDT_SYSIGT,
1445 	    SEL_KPL, 0);
1446 	setidt(IDT_PF, pti ? &IDTVEC(page_pti) : &IDTVEC(page), SDT_SYSIGT,
1447 	    SEL_KPL, 0);
1448 	setidt(IDT_MF, pti ? &IDTVEC(fpu_pti) : &IDTVEC(fpu), SDT_SYSIGT,
1449 	    SEL_KPL, 0);
1450 	setidt(IDT_AC, pti ? &IDTVEC(align_pti) : &IDTVEC(align), SDT_SYSIGT,
1451 	    SEL_KPL, 0);
1452 	setidt(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 3);
1453 	setidt(IDT_XF, pti ? &IDTVEC(xmm_pti) : &IDTVEC(xmm), SDT_SYSIGT,
1454 	    SEL_KPL, 0);
1455 #ifdef KDTRACE_HOOKS
1456 	setidt(IDT_DTRACE_RET, pti ? &IDTVEC(dtrace_ret_pti) :
1457 	    &IDTVEC(dtrace_ret), SDT_SYSIGT, SEL_UPL, 0);
1458 #endif
1459 #ifdef XENHVM
1460 	setidt(IDT_EVTCHN, pti ? &IDTVEC(xen_intr_upcall_pti) :
1461 	    &IDTVEC(xen_intr_upcall), SDT_SYSIGT, SEL_KPL, 0);
1462 #endif
1463 	r_idt.rd_limit = sizeof(idt0) - 1;
1464 	r_idt.rd_base = (long) idt;
1465 	lidt(&r_idt);
1466 
1467 	TUNABLE_INT_FETCH("hw.ibrs_disable", &hw_ibrs_disable);
1468 	TUNABLE_INT_FETCH("machdep.mitigations.ibrs.disable", &hw_ibrs_disable);
1469 
1470 	TUNABLE_INT_FETCH("hw.spec_store_bypass_disable", &hw_ssb_disable);
1471 	TUNABLE_INT_FETCH("machdep.mitigations.ssb.disable", &hw_ssb_disable);
1472 
1473 	TUNABLE_INT_FETCH("machdep.syscall_ret_flush_l1d",
1474 	    &syscall_ret_l1d_flush_mode);
1475 
1476 	TUNABLE_INT_FETCH("hw.mds_disable", &hw_mds_disable);
1477 	TUNABLE_INT_FETCH("machdep.mitigations.mds.disable", &hw_mds_disable);
1478 
1479 	TUNABLE_INT_FETCH("machdep.mitigations.taa.enable", &x86_taa_enable);
1480 
1481 	TUNABLE_INT_FETCH("machdep.mitigations.rngds.enable",
1482 	    &x86_rngds_mitg_enable);
1483 
1484 	TUNABLE_INT_FETCH("machdep.mitigations.zenbleed.enable",
1485 	    &zenbleed_enable);
1486 	zenbleed_sanitize_enable();
1487 
1488 	finishidentcpu();	/* Final stage of CPU initialization */
1489 
1490 	invlpgb_works = (amd_extended_feature_extensions &
1491 	    AMDFEID_INVLPGB) != 0;
1492 	TUNABLE_INT_FETCH("vm.pmap.invlpgb_works", &invlpgb_works);
1493 	if (invlpgb_works)
1494 		invlpgb_maxcnt = cpu_procinfo3 & AMDID_INVLPGB_MAXCNT;
1495 
1496 	/*
1497 	 * Initialize the clock before the console so that console
1498 	 * initialization can use DELAY().
1499 	 */
1500 	clock_init();
1501 
1502 	initializecpu();	/* Initialize CPU registers */
1503 
1504 	amd64_bsp_ist_init(pc);
1505 
1506 	/* Set the IO permission bitmap (empty due to tss seg limit) */
1507 	pc->pc_common_tss.tss_iobase = sizeof(struct amd64tss) +
1508 	    IOPERM_BITMAP_SIZE;
1509 
1510 	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
1511 	ltr(gsel_tss);
1512 
1513 	amd64_conf_fast_syscall();
1514 
1515 	/*
1516 	 * We initialize the PCB pointer early so that exception
1517 	 * handlers will work.  Also set up td_critnest to short-cut
1518 	 * the page fault handler.
1519 	 */
1520 	cpu_max_ext_state_size = sizeof(struct savefpu);
1521 	set_top_of_stack_td(&thread0);
1522 	thread0.td_pcb = get_pcb_td(&thread0);
1523 	thread0.td_critnest = 1;
1524 
1525 	/*
1526 	 * The console and kdb should be initialized even earlier than here,
1527 	 * but some console drivers don't work until after getmemsize().
1528 	 * Default to late console initialization to support these drivers.
1529 	 * This loses mainly printf()s in getmemsize() and early debugging.
1530 	 */
1531 	TUNABLE_INT_FETCH("debug.late_console", &late_console);
1532 	if (!late_console) {
1533 		cninit();
1534 		amd64_kdb_init();
1535 	}
1536 
1537 	getmemsize(kmdp, physfree);
1538 	init_param2(physmem);
1539 
1540 	/* now running on new page tables, configured,and u/iom is accessible */
1541 	/* now running on new page tables, configured, and u/iom is accessible */
1542 #ifdef DEV_PCI
1543         /* This call might adjust phys_avail[]. */
1544         pci_early_quirks();
1545 #endif
1546 
1547 	if (late_console)
1548 		cninit();
1549 
1550 	/*
1551 	 * Dump the boot metadata. We have to wait for cninit() since console
1552 	 * output is required. If it's grossly incorrect the kernel will never
1553 	 * make it this far.
1554 	 */
1555 	if (getenv_is_true("debug.dump_modinfo_at_boot"))
1556 		preload_dump();
1557 
1558 #ifdef DEV_ISA
1559 #ifdef DEV_ATPIC
1560 	elcr_probe();
1561 	atpic_startup();
1562 #else
1563 	/* Reset and mask the atpics and leave them shut down. */
1564 	atpic_reset();
1565 
1566 	/*
1567 	 * Point the ICU spurious interrupt vectors at the APIC spurious
1568 	 * interrupt handler.
1569 	 */
1570 	setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
1571 	setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
1572 #endif
1573 #else
1574 #error "have you forgotten the isa device?"
1575 #endif
1576 
1577 	if (late_console)
1578 		amd64_kdb_init();
1579 
1580 	msgbufinit(msgbufp, msgbufsize);
1581 	fpuinit();
1582 
1583 	/* make an initial tss so cpu can get interrupt stack on syscall! */
1584 	rsp0 = thread0.td_md.md_stack_base;
1585 	/* Ensure the stack is aligned to 16 bytes */
1586 	rsp0 &= ~0xFul;
1587 	PCPU_PTR(common_tss)->tss_rsp0 = rsp0;
1588 	amd64_bsp_pcpu_init2(rsp0);
1589 
1590 	/* transfer to user mode */
1591 
1592 	_ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
1593 	_udatasel = GSEL(GUDATA_SEL, SEL_UPL);
1594 	_ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL);
1595 	_ufssel = GSEL(GUFS32_SEL, SEL_UPL);
1596 	_ugssel = GSEL(GUGS32_SEL, SEL_UPL);
1597 
1598 	load_ds(_udatasel);
1599 	load_es(_udatasel);
1600 	load_fs(_ufssel);
1601 
1602 	/* setup proc 0's pcb */
1603 	thread0.td_pcb->pcb_flags = 0;
1604 
1605         env = kern_getenv("kernelname");
1606 	if (env != NULL)
1607 		strlcpy(kernelname, env, sizeof(kernelname));
1608 
1609 	kcsan_cpu_init(0);
1610 
1611 #ifdef FDT
1612 	x86_init_fdt();
1613 #endif
1614 	thread0.td_critnest = 0;
1615 
1616 	kasan_init();
1617 	kmsan_init();
1618 
1619 	TSEXIT();
1620 
1621 	/* Location of kernel stack for locore */
1622 	return (thread0.td_md.md_stack_base);
1623 }
1624 
1625 void
1626 cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
1627 {
1628 
1629 	pcpu->pc_acpi_id = 0xffffffff;
1630 }
1631 
1632 static int
1633 smap_sysctl_handler(SYSCTL_HANDLER_ARGS)
1634 {
1635 	struct bios_smap *smapbase;
1636 	struct bios_smap_xattr smap;
1637 	caddr_t kmdp;
1638 	uint32_t *smapattr;
1639 	int count, error, i;
1640 
1641 	/* Retrieve the system memory map from the loader. */
1642 	kmdp = preload_search_by_type("elf kernel");
1643 	if (kmdp == NULL)
1644 		kmdp = preload_search_by_type("elf64 kernel");
1645 	smapbase = (struct bios_smap *)preload_search_info(kmdp,
1646 	    MODINFO_METADATA | MODINFOMD_SMAP);
1647 	if (smapbase == NULL)
1648 		return (0);
1649 	smapattr = (uint32_t *)preload_search_info(kmdp,
1650 	    MODINFO_METADATA | MODINFOMD_SMAP_XATTR);
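	/* The loader stores the SMAP metadata size in a uint32_t just before the data. */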
1651 	count = *((uint32_t *)smapbase - 1) / sizeof(*smapbase);
1652 	error = 0;
1653 	for (i = 0; i < count; i++) {
1654 		smap.base = smapbase[i].base;
1655 		smap.length = smapbase[i].length;
1656 		smap.type = smapbase[i].type;
1657 		if (smapattr != NULL)
1658 			smap.xattr = smapattr[i];
1659 		else
1660 			smap.xattr = 0;
1661 		error = SYSCTL_OUT(req, &smap, sizeof(smap));
1662 	}
1663 	return (error);
1664 }
1665 SYSCTL_PROC(_machdep, OID_AUTO, smap,
1666     CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
1667     smap_sysctl_handler, "S,bios_smap_xattr",
1668     "Raw BIOS SMAP data");
1669 
1670 static int
1671 efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS)
1672 {
1673 	struct efi_map_header *efihdr;
1674 	caddr_t kmdp;
1675 	uint32_t efisize;
1676 
1677 	kmdp = preload_search_by_type("elf kernel");
1678 	if (kmdp == NULL)
1679 		kmdp = preload_search_by_type("elf64 kernel");
1680 	efihdr = (struct efi_map_header *)preload_search_info(kmdp,
1681 	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
1682 	if (efihdr == NULL)
1683 		return (0);
1684 	efisize = *((uint32_t *)efihdr - 1);
1685 	return (SYSCTL_OUT(req, efihdr, efisize));
1686 }
1687 SYSCTL_PROC(_machdep, OID_AUTO, efi_map,
1688     CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
1689     efi_map_sysctl_handler, "S,efi_map_header",
1690     "Raw EFI Memory Map");
1691 
1692 static int
1693 efi_arch_sysctl_handler(SYSCTL_HANDLER_ARGS)
1694 {
1695 	char *arch;
1696 	caddr_t kmdp;
1697 
1698 	kmdp = preload_search_by_type("elf kernel");
1699 	if (kmdp == NULL)
1700 		kmdp = preload_search_by_type("elf64 kernel");
1701 
1702 	arch = (char *)preload_search_info(kmdp,
1703 	    MODINFO_METADATA | MODINFOMD_EFI_ARCH);
1704 	if (arch == NULL)
1705 		return (0);
1706 
1707 	return (SYSCTL_OUT_STR(req, arch));
1708 }
1709 SYSCTL_PROC(_machdep, OID_AUTO, efi_arch,
1710     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
1711     efi_arch_sysctl_handler, "A", "EFI Firmware Architecture");
1712 
1713 void
1714 spinlock_enter(void)
1715 {
1716 	struct thread *td;
1717 	register_t flags;
1718 
1719 	td = curthread;
1720 	if (td->td_md.md_spinlock_count == 0) {
1721 		flags = intr_disable();
1722 		td->td_md.md_spinlock_count = 1;
1723 		td->td_md.md_saved_flags = flags;
1724 		critical_enter();
1725 	} else
1726 		td->td_md.md_spinlock_count++;
1727 }
1728 
1729 void
1730 spinlock_exit(void)
1731 {
1732 	struct thread *td;
1733 	register_t flags;
1734 
1735 	td = curthread;
1736 	flags = td->td_md.md_saved_flags;
1737 	td->td_md.md_spinlock_count--;
1738 	if (td->td_md.md_spinlock_count == 0) {
1739 		critical_exit();
1740 		intr_restore(flags);
1741 	}
1742 }
1743 
1744 /*
1745  * Construct a PCB from a trapframe. This is called from kdb_trap() where
1746  * we want to start a backtrace from the function that caused us to enter
1747  * the debugger. We have the context in the trapframe, but base the trace
1748  * on the PCB. The PCB doesn't have to be perfect, as long as it contains
1749  * enough for a backtrace.
1750  */
1751 void
1752 makectx(struct trapframe *tf, struct pcb *pcb)
1753 {
1754 
1755 	pcb->pcb_r12 = tf->tf_r12;
1756 	pcb->pcb_r13 = tf->tf_r13;
1757 	pcb->pcb_r14 = tf->tf_r14;
1758 	pcb->pcb_r15 = tf->tf_r15;
1759 	pcb->pcb_rbp = tf->tf_rbp;
1760 	pcb->pcb_rbx = tf->tf_rbx;
1761 	pcb->pcb_rip = tf->tf_rip;
1762 	pcb->pcb_rsp = tf->tf_rsp;
1763 }
1764 
1765 /*
1766  * The pcb_flags is only modified by current thread, or by other threads
1767  * when current thread is stopped.  However, current thread may change it
1768  * from the interrupt context in cpu_switch(), or in the trap handler.
1769  * When we read-modify-write pcb_flags from C sources, compiler may generate
1770  * code that is not atomic regarding the interrupt handler.  If a trap or
1771  * interrupt happens and any flag is modified from the handler, it can be
1772  * clobbered with the cached value later.  Therefore, we implement setting
1773  * and clearing flags with single-instruction functions, which do not race
1774  * with possible modification of the flags from the trap or interrupt context,
1775  * because traps and interrupts are executed only on instruction boundary.
1776  */
1777 void
1778 set_pcb_flags_raw(struct pcb *pcb, const u_int flags)
1779 {
1780 
1781 	__asm __volatile("orl %1,%0"
1782 	    : "=m" (pcb->pcb_flags) : "ir" (flags), "m" (pcb->pcb_flags)
1783 	    : "cc", "memory");
1784 
1785 }
1786 
1787 /*
1788  * The support for RDFSBASE, WRFSBASE and similar instructions for %gs
1789  * base requires that the kernel save MSR_FSBASE and MSR_{K,}GSBASE into
1790  * pcb if user space modified the bases.  We must save on the context
1791  * switch or if the return to usermode happens through the doreti.
1792  *
1793  * Tracking of both events is performed by the pcb flag PCB_FULL_IRET,
1794  * which has the consequence that the base MSRs must be saved each time
1795  * the PCB_FULL_IRET flag is set.  We disable interrupts to sync with
1796  * context switches.
1797  */
1798 static void
1799 set_pcb_flags_fsgsbase(struct pcb *pcb, const u_int flags)
1800 {
1801 	register_t r;
1802 
1803 	if (curpcb == pcb &&
1804 	    (flags & PCB_FULL_IRET) != 0 &&
1805 	    (pcb->pcb_flags & PCB_FULL_IRET) == 0) {
1806 		r = intr_disable();
1807 		if ((pcb->pcb_flags & PCB_FULL_IRET) == 0) {
1808 			if (rfs() == _ufssel)
1809 				pcb->pcb_fsbase = rdfsbase();
1810 			if (rgs() == _ugssel)
1811 				pcb->pcb_gsbase = rdmsr(MSR_KGSBASE);
1812 		}
1813 		set_pcb_flags_raw(pcb, flags);
1814 		intr_restore(r);
1815 	} else {
1816 		set_pcb_flags_raw(pcb, flags);
1817 	}
1818 }
1819 
1820 DEFINE_IFUNC(, void, set_pcb_flags, (struct pcb *, const u_int))
1821 {
1822 
1823 	return ((cpu_stdext_feature & CPUID_STDEXT_FSGSBASE) != 0 ?
1824 	    set_pcb_flags_fsgsbase : set_pcb_flags_raw);
1825 }
1826 
1827 void
1828 clear_pcb_flags(struct pcb *pcb, const u_int flags)
1829 {
1830 
1831 	__asm __volatile("andl %1,%0"
1832 	    : "=m" (pcb->pcb_flags) : "ir" (~flags), "m" (pcb->pcb_flags)
1833 	    : "cc", "memory");
1834 }
1835 
1836 #ifdef KDB
1837 
1838 /*
1839  * Provide inb() and outb() as functions.  They are normally only available as
1840  * inline functions, thus cannot be called from the debugger.
1841  */
1842 
1843 /* silence compiler warnings */
1844 u_char inb_(u_short);
1845 void outb_(u_short, u_char);
1846 
1847 u_char
1848 inb_(u_short port)
1849 {
1850 	return inb(port);
1851 }
1852 
1853 void
1854 outb_(u_short port, u_char data)
1855 {
1856 	outb(port, data);
1857 }
1858 
1859 #endif /* KDB */
1860 
1861 #undef memset
1862 #undef memmove
1863 #undef memcpy
1864 
1865 void	*memset_std(void *buf, int c, size_t len);
1866 void	*memset_erms(void *buf, int c, size_t len);
1867 void    *memmove_std(void * _Nonnull dst, const void * _Nonnull src,
1868 	    size_t len);
1869 void    *memmove_erms(void * _Nonnull dst, const void * _Nonnull src,
1870 	    size_t len);
1871 void    *memcpy_std(void * _Nonnull dst, const void * _Nonnull src,
1872 	    size_t len);
1873 void    *memcpy_erms(void * _Nonnull dst, const void * _Nonnull src,
1874 	    size_t len);
1875 
1876 #ifdef KCSAN
1877 /*
1878  * These fail to build as ifuncs when used with KCSAN.
1879  */
1880 void *
1881 memset(void *buf, int c, size_t len)
1882 {
1883 
1884 	return (memset_std(buf, c, len));
1885 }
1886 
1887 void *
1888 memmove(void * _Nonnull dst, const void * _Nonnull src, size_t len)
1889 {
1890 
1891 	return (memmove_std(dst, src, len));
1892 }
1893 
1894 void *
1895 memcpy(void * _Nonnull dst, const void * _Nonnull src, size_t len)
1896 {
1897 
1898 	return (memcpy_std(dst, src, len));
1899 }
1900 #else
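/*
 * Resolve the bulk memory primitives once at boot: use the "rep movsb"/
 * "rep stosb" based variants when the CPU advertises ERMS (Enhanced REP
 * MOVSB/STOSB), otherwise fall back to the generic implementations.
 */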
1901 DEFINE_IFUNC(, void *, memset, (void *, int, size_t))
1902 {
1903 
1904 	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
1905 	    memset_erms : memset_std);
1906 }
1907 
1908 DEFINE_IFUNC(, void *, memmove, (void * _Nonnull, const void * _Nonnull,
1909     size_t))
1910 {
1911 
1912 	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
1913 	    memmove_erms : memmove_std);
1914 }
1915 
1916 DEFINE_IFUNC(, void *, memcpy, (void * _Nonnull, const void * _Nonnull,size_t))
1917 {
1918 
1919 	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
1920 	    memcpy_erms : memcpy_std);
1921 }
1922 #endif
1923 
1924 void	pagezero_std(void *addr);
1925 void	pagezero_erms(void *addr);
1926 DEFINE_IFUNC(, void , pagezero, (void *))
1927 {
1928 
1929 	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
1930 	    pagezero_erms : pagezero_std);
1931 }
1932