/*-
 * SPDX-License-Identifier: BSD-4-Clause
 *
 * Copyright (c) 2003 Peter Wemm.
 * Copyright (c) 1992 Terrence R. Lambert.
 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * William Jolitz.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
#include "opt_atpic.h"
#include "opt_cpu.h"
#include "opt_ddb.h"
#include "opt_inet.h"
#include "opt_isa.h"
#include "opt_kstack_pages.h"
#include "opt_maxmem.h"
#include "opt_pci.h"
#include "opt_platform.h"
#include "opt_sched.h"

#include <sys/param.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/asan.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/bus.h>
#include <sys/callout.h>
#include <sys/cons.h>
#include <sys/cpu.h>
#include <sys/csan.h>
#include <sys/efi.h>
#include <sys/eventhandler.h>
#include <sys/exec.h>
#include <sys/imgact.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/linker.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/memrange.h>
#include <sys/msan.h>
#include <sys/msgbuf.h>
#include <sys/mutex.h>
#include <sys/pcpu.h>
#include <sys/ptrace.h>
#include <sys/reboot.h>
#include <sys/reg.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/signalvar.h>
#ifdef SMP
#include <sys/smp.h>
#endif
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/sysproto.h>
#include <sys/ucontext.h>
#include <sys/vmmeter.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_extern.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_pager.h>
#include <vm/vm_phys.h>
#include <vm/vm_dumpset.h>

#ifdef DDB
#ifndef KDB
#error KDB must be enabled in order for DDB to work!
#endif
#include <ddb/ddb.h>
#include <ddb/db_sym.h>
#endif

#include <net/netisr.h>

#include <dev/smbios/smbios.h>

#include <machine/clock.h>
#include <machine/cpu.h>
#include <machine/cputypes.h>
#include <machine/frame.h>
#include <machine/intr_machdep.h>
#include <x86/mca.h>
#include <machine/md_var.h>
#include <machine/metadata.h>
#include <machine/pc/bios.h>
#include <machine/pcb.h>
#include <machine/proc.h>
#include <machine/sigframe.h>
#include <machine/specialreg.h>
#include <machine/trap.h>
#include <machine/tss.h>
#include <x86/ucode.h>
#include <x86/ifunc.h>
#ifdef SMP
#include <machine/smp.h>
#endif
#ifdef FDT
#include <x86/fdt.h>
#endif

#ifdef DEV_ATPIC
#include <x86/isa/icu.h>
#else
#include <x86/apicvar.h>
#endif

#include <isa/isareg.h>
#include <isa/rtc.h>
#include <x86/init.h>

/* Sanity check for __curthread() */
CTASSERT(offsetof(struct pcpu, pc_curthread) == 0);

/*
 * The PTI trampoline stack needs enough space for a hardware trapframe and a
 * couple of scratch registers, as well as the trapframe left behind after an
 * iret fault.
 */
CTASSERT(PC_PTI_STACK_SZ * sizeof(register_t) >= 2 * sizeof(struct pti_frame) -
    offsetof(struct pti_frame, pti_rip));

extern u_int64_t hammer_time(u_int64_t, u_int64_t);

static void cpu_startup(void *);
SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL);

/* Probe 8254 PIT and TSC. */
static void native_clock_source_init(void);

/* Preload data parse function */
static void native_parse_preload_data(u_int64_t);

/* Native function to fetch and parse the e820 map */
static void native_parse_memmap(vm_paddr_t *, int *);

/* Default init_ops implementation. */
struct init_ops init_ops = {
	.parse_preload_data =		native_parse_preload_data,
	.early_clock_source_init =	native_clock_source_init,
	.early_delay =			i8254_delay,
	.parse_memmap =			native_parse_memmap,
};
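
/*
 * Alternative boot paths can override these hooks before hammer_time()
 * runs; for example, the Xen PVH entry code is believed to install its
 * own init_ops so that the memory map and early clock setup come from
 * hypervisor-provided metadata rather than the BIOS/EFI loader data.
 */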

/*
 * Physical address of the EFI System Table. Stashed from the metadata hints
 * passed into the kernel and used by the EFI code to call runtime services.
 */
vm_paddr_t efi_systbl_phys;

/* Intel ICH registers */
#define ICH_PMBASE	0x400
#define ICH_SMI_EN	(ICH_PMBASE + 0x30)
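
/*
 * SMI_EN is the "SMI Control and Enable" register in the ICH
 * power-management I/O block at PMBASE; clearing bit 3 (LEGACY_USB_EN,
 * the 0x8 mask used below) stops the chipset from raising SMI# for
 * legacy USB keyboard/mouse emulation.
 */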

int	_udatasel, _ucodesel, _ucode32sel, _ufssel, _ugssel;

int cold = 1;

long Maxmem = 0;
long realmem = 0;
int late_console = 1;

struct kva_md_info kmi;

struct region_descriptor r_idt;

struct pcpu *__pcpu;
struct pcpu temp_bsp_pcpu;

struct mtx icu_lock;

struct mem_range_softc mem_range_softc;

struct mtx dt_lock;	/* lock for GDT and LDT */

void (*vmm_suspend_p)(void);
void (*vmm_resume_p)(void);

bool efi_boot;

static void
cpu_startup(void *dummy)
{
	uintmax_t memsize;
	char *sysenv;

	/*
	 * On MacBooks, we need to prevent the legacy USB circuit from
	 * generating an SMI#, because that can cause several problems,
	 * namely: incorrect CPU frequency detection and failure to
	 * start the APs.
	 * We do this by clearing a bit in the SMI_EN (SMI Control and
	 * Enable) register of the Intel ICH LPC Interface Bridge.
	 */
	sysenv = kern_getenv("smbios.system.product");
	if (sysenv != NULL) {
		if (strncmp(sysenv, "MacBook1,1", 10) == 0 ||
		    strncmp(sysenv, "MacBook3,1", 10) == 0 ||
		    strncmp(sysenv, "MacBook4,1", 10) == 0 ||
		    strncmp(sysenv, "MacBookPro1,1", 13) == 0 ||
		    strncmp(sysenv, "MacBookPro1,2", 13) == 0 ||
		    strncmp(sysenv, "MacBookPro3,1", 13) == 0 ||
		    strncmp(sysenv, "MacBookPro4,1", 13) == 0 ||
		    strncmp(sysenv, "Macmini1,1", 10) == 0) {
			if (bootverbose)
				printf("Disabling LEGACY_USB_EN bit on "
				    "Intel ICH.\n");
			outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8);
		}
		freeenv(sysenv);
	}

	/*
	 * Good {morning,afternoon,evening,night}.
	 */
	startrtclock();
	printcpuinfo();

	/*
	 * Display physical memory if SMBIOS reports a reasonable amount.
	 */
	memsize = 0;
	sysenv = kern_getenv("smbios.memory.enabled");
	if (sysenv != NULL) {
		memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10;
		freeenv(sysenv);
	}
	if (memsize < ptoa((uintmax_t)vm_free_count()))
		memsize = ptoa((uintmax_t)Maxmem);
	printf("real memory  = %ju (%ju MB)\n", memsize, memsize >> 20);
	realmem = atop(memsize);

	/*
	 * Display any holes after the first chunk of extended memory.
	 */
	if (bootverbose) {
		int indx;

		printf("Physical memory chunk(s):\n");
		for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
			vm_paddr_t size;

			size = phys_avail[indx + 1] - phys_avail[indx];
			printf(
			    "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n",
			    (uintmax_t)phys_avail[indx],
			    (uintmax_t)phys_avail[indx + 1] - 1,
			    (uintmax_t)size, (uintmax_t)size / PAGE_SIZE);
		}
	}

	vm_ksubmap_init(&kmi);

	printf("avail memory = %ju (%ju MB)\n",
	    ptoa((uintmax_t)vm_free_count()),
	    ptoa((uintmax_t)vm_free_count()) / 1048576);
#ifdef DEV_PCI
	if (bootverbose && intel_graphics_stolen_base != 0)
		printf("intel stolen mem: base %#jx size %ju MB\n",
		    (uintmax_t)intel_graphics_stolen_base,
		    (uintmax_t)intel_graphics_stolen_size / 1024 / 1024);
#endif

	/*
	 * Set up buffers, so they can be used to read disk labels.
	 */
	bufinit();
	vm_pager_bufferinit();

	cpu_setregs();
}

static void
late_ifunc_resolve(void *dummy __unused)
{
	link_elf_late_ireloc();
}
SYSINIT(late_ifunc_resolve, SI_SUB_CPU, SI_ORDER_ANY, late_ifunc_resolve, NULL);

void
cpu_setregs(void)
{
	register_t cr0;

	TSENTER();
	cr0 = rcr0();
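	/*
	 * MP and NE make x87 errors report through #MF instead of the
	 * legacy FERR# pin, TS forces the first FPU use to trap so state
	 * can be restored lazily, WP makes supervisor code honor
	 * page-level write protection, and AM allows EFLAGS.AC alignment
	 * checking.
	 */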
	cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM;
	TSENTER2("load_cr0");
	load_cr0(cr0);
	TSEXIT2("load_cr0");
	TSEXIT();
}

/*
 * Initialize amd64 and configure to run kernel
 */

/*
 * Initialize segments & interrupt table
 */
static struct gate_descriptor idt0[NIDT];
struct gate_descriptor *idt = &idt0[0];	/* interrupt descriptor table */

static char dblfault_stack[DBLFAULT_STACK_SIZE] __aligned(16);
static char mce0_stack[MCE_STACK_SIZE] __aligned(16);
static char nmi0_stack[NMI_STACK_SIZE] __aligned(16);
static char dbg0_stack[DBG_STACK_SIZE] __aligned(16);
CTASSERT(sizeof(struct nmi_pcpu) == 16);

/*
 * Software prototypes -- in more palatable form.
 *
 * Keep GUFS32, GUGS32, GUCODE32 and GUDATA at the same
 * slots as corresponding segments for i386 kernel.
 */
struct soft_segment_descriptor gdt_segs[] = {
[GNULL_SEL] = { /* 0 Null Descriptor */
	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
[GNULL2_SEL] = { /* 1 Null Descriptor */
	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
[GUFS32_SEL] = { /* 2 32 bit %fs Descriptor for user */
	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
[GUGS32_SEL] = { /* 3 32 bit %gs Descriptor for user */
	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
[GCODE_SEL] = { /* 4 Code Descriptor for kernel */
	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_long = 1,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
[GDATA_SEL] = { /* 5 Data Descriptor for kernel */
	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_long = 1,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
[GUCODE32_SEL] = { /* 6 32 bit Code Descriptor for user */
	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
[GUDATA_SEL] = { /* 7 32/64 bit Data Descriptor for user */
	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
[GUCODE_SEL] = { /* 8 64 bit Code Descriptor for user */
	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 1,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
[GPROC0_SEL] = { /* 9 Proc 0 TSS Descriptor */
	.ssd_base = 0x0,
	.ssd_limit = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE - 1,
	.ssd_type = SDT_SYSTSS,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
[GPROC0_SEL + 1] = { /* 10 Proc 0 TSS descriptor, double size */
	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
[GUSERLDT_SEL] = { /* 11 LDT Descriptor */
	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
[GUSERLDT_SEL + 1] = { /* 12 LDT Descriptor, double size */
	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
};
_Static_assert(nitems(gdt_segs) == NGDT, "Stale NGDT");

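/*
 * Install an interrupt gate: idx selects the IDT slot, func is the entry
 * point, typ and dpl give the gate type and the highest privilege level
 * allowed to reach it with a software INT, and a non-zero ist makes the
 * CPU switch to the corresponding interrupt-stack-table stack on entry.
 */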
void
setidt(int idx, inthand_t *func, int typ, int dpl, int ist)
{
	struct gate_descriptor *ip;

	ip = idt + idx;
	ip->gd_looffset = (uintptr_t)func;
	ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
	ip->gd_ist = ist;
	ip->gd_xx = 0;
	ip->gd_type = typ;
	ip->gd_dpl = dpl;
	ip->gd_p = 1;
	ip->gd_hioffset = ((uintptr_t)func) >> 16;
}

extern inthand_t
	IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
	IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
	IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
	IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
	IDTVEC(xmm), IDTVEC(dblfault),
	IDTVEC(div_pti), IDTVEC(bpt_pti),
	IDTVEC(ofl_pti), IDTVEC(bnd_pti), IDTVEC(ill_pti), IDTVEC(dna_pti),
	IDTVEC(fpusegm_pti), IDTVEC(tss_pti), IDTVEC(missing_pti),
	IDTVEC(stk_pti), IDTVEC(prot_pti), IDTVEC(page_pti),
	IDTVEC(rsvd_pti), IDTVEC(fpu_pti), IDTVEC(align_pti),
	IDTVEC(xmm_pti),
#ifdef KDTRACE_HOOKS
	IDTVEC(dtrace_ret), IDTVEC(dtrace_ret_pti),
#endif
#ifdef XENHVM
	IDTVEC(xen_intr_upcall), IDTVEC(xen_intr_upcall_pti),
#endif
	IDTVEC(fast_syscall), IDTVEC(fast_syscall32),
	IDTVEC(fast_syscall_pti);

#ifdef DDB
/*
 * Display the index and function name of any IDT entries that don't use
 * the default 'rsvd' entry point.
 */
DB_SHOW_COMMAND_FLAGS(idt, db_show_idt, DB_CMD_MEMSAFE)
{
	struct gate_descriptor *ip;
	int idx;
	uintptr_t func;

	ip = idt;
	for (idx = 0; idx < NIDT && !db_pager_quit; idx++) {
		func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset);
		if (func != (uintptr_t)&IDTVEC(rsvd)) {
			db_printf("%3d\t", idx);
			db_printsym(func, DB_STGY_PROC);
			db_printf("\n");
		}
		ip++;
	}
}

/* Show privileged registers. */
DB_SHOW_COMMAND_FLAGS(sysregs, db_show_sysregs, DB_CMD_MEMSAFE)
{
	struct {
		uint16_t limit;
		uint64_t base;
	} __packed idtr, gdtr;
	uint16_t ldt, tr;

	__asm __volatile("sidt %0" : "=m" (idtr));
	db_printf("idtr\t0x%016lx/%04x\n",
	    (u_long)idtr.base, (u_int)idtr.limit);
	__asm __volatile("sgdt %0" : "=m" (gdtr));
	db_printf("gdtr\t0x%016lx/%04x\n",
	    (u_long)gdtr.base, (u_int)gdtr.limit);
	__asm __volatile("sldt %0" : "=r" (ldt));
	db_printf("ldtr\t0x%04x\n", ldt);
	__asm __volatile("str %0" : "=r" (tr));
	db_printf("tr\t0x%04x\n", tr);
	db_printf("cr0\t0x%016lx\n", rcr0());
	db_printf("cr2\t0x%016lx\n", rcr2());
	db_printf("cr3\t0x%016lx\n", rcr3());
	db_printf("cr4\t0x%016lx\n", rcr4());
	if (rcr4() & CR4_XSAVE)
		db_printf("xcr0\t0x%016lx\n", rxcr(0));
	db_printf("EFER\t0x%016lx\n", rdmsr(MSR_EFER));
	if (cpu_feature2 & (CPUID2_VMX | CPUID2_SMX))
		db_printf("FEATURES_CTL\t%016lx\n",
		    rdmsr(MSR_IA32_FEATURE_CONTROL));
	db_printf("DEBUG_CTL\t0x%016lx\n", rdmsr(MSR_DEBUGCTLMSR));
	db_printf("PAT\t0x%016lx\n", rdmsr(MSR_PAT));
	db_printf("GSBASE\t0x%016lx\n", rdmsr(MSR_GSBASE));
}

DB_SHOW_COMMAND_FLAGS(dbregs, db_show_dbregs, DB_CMD_MEMSAFE)
{

	db_printf("dr0\t0x%016lx\n", rdr0());
	db_printf("dr1\t0x%016lx\n", rdr1());
	db_printf("dr2\t0x%016lx\n", rdr2());
	db_printf("dr3\t0x%016lx\n", rdr3());
	db_printf("dr6\t0x%016lx\n", rdr6());
	db_printf("dr7\t0x%016lx\n", rdr7());
}
#endif

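/*
 * Converters between the packed hardware segment-descriptor layouts and
 * the more palatable soft_segment_descriptor form used for gdt_segs[]
 * above; ssdtosyssd() handles the 16-byte long-mode system descriptors
 * (TSS and LDT).
 */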
void
sdtossd(struct user_segment_descriptor *sd, struct soft_segment_descriptor *ssd)
{

	ssd->ssd_base  = (sd->sd_hibase << 24) | sd->sd_lobase;
	ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
	ssd->ssd_type  = sd->sd_type;
	ssd->ssd_dpl   = sd->sd_dpl;
	ssd->ssd_p     = sd->sd_p;
	ssd->ssd_long  = sd->sd_long;
	ssd->ssd_def32 = sd->sd_def32;
	ssd->ssd_gran  = sd->sd_gran;
}

void
ssdtosd(struct soft_segment_descriptor *ssd, struct user_segment_descriptor *sd)
{

	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff;
	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
	sd->sd_type  = ssd->ssd_type;
	sd->sd_dpl   = ssd->ssd_dpl;
	sd->sd_p     = ssd->ssd_p;
	sd->sd_long  = ssd->ssd_long;
	sd->sd_def32 = ssd->ssd_def32;
	sd->sd_gran  = ssd->ssd_gran;
}

void
ssdtosyssd(struct soft_segment_descriptor *ssd, struct system_segment_descriptor *sd)
{

	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful;
	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
	sd->sd_type  = ssd->ssd_type;
	sd->sd_dpl   = ssd->ssd_dpl;
	sd->sd_p     = ssd->ssd_p;
	sd->sd_gran  = ssd->ssd_gran;
}

u_int basemem;

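/*
 * Add the region [base, base + length) to the physmap base/bound array,
 * keeping the array sorted by address, merging with an adjacent entry
 * when possible and dropping regions that overlap an existing entry.
 * Returns 0 only when the array is full, telling the caller to stop.
 */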
static int
add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap,
    int *physmap_idxp)
{
	int i, insert_idx, physmap_idx;

	physmap_idx = *physmap_idxp;

	if (length == 0)
		return (1);

	/*
	 * Find insertion point while checking for overlap.  Start off by
	 * assuming the new entry will be added to the end.
	 *
	 * NB: physmap_idx points to the next free slot.
	 */
	insert_idx = physmap_idx;
	for (i = 0; i <= physmap_idx; i += 2) {
		if (base < physmap[i + 1]) {
			if (base + length <= physmap[i]) {
				insert_idx = i;
				break;
			}
			if (boothowto & RB_VERBOSE)
				printf(
		    "Overlapping memory regions, ignoring second region\n");
			return (1);
		}
	}

	/* See if we can prepend to the next entry. */
	if (insert_idx <= physmap_idx && base + length == physmap[insert_idx]) {
		physmap[insert_idx] = base;
		return (1);
	}

	/* See if we can append to the previous entry. */
	if (insert_idx > 0 && base == physmap[insert_idx - 1]) {
		physmap[insert_idx - 1] += length;
		return (1);
	}

	physmap_idx += 2;
	*physmap_idxp = physmap_idx;
	if (physmap_idx == PHYS_AVAIL_ENTRIES) {
		printf(
		"Too many segments in the physical address map, giving up\n");
		return (0);
	}

	/*
	 * Move the last 'N' entries down to make room for the new
	 * entry if needed.
	 */
	for (i = (physmap_idx - 2); i > insert_idx; i -= 2) {
		physmap[i] = physmap[i - 2];
		physmap[i + 1] = physmap[i - 1];
	}

	/* Insert the new entry. */
	physmap[insert_idx] = base;
	physmap[insert_idx + 1] = base + length;
	return (1);
}

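/*
 * Walk the BIOS INT 15h/E820 (SMAP) entries handed over by the loader
 * and feed the SMAP_TYPE_MEMORY ranges into the physmap array.
 */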
void
bios_add_smap_entries(struct bios_smap *smapbase, u_int32_t smapsize,
    vm_paddr_t *physmap, int *physmap_idx)
{
	struct bios_smap *smap, *smapend;

	smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);

	for (smap = smapbase; smap < smapend; smap++) {
		if (boothowto & RB_VERBOSE)
			printf("SMAP type=%02x base=%016lx len=%016lx\n",
			    smap->type, smap->base, smap->length);

		if (smap->type != SMAP_TYPE_MEMORY)
			continue;

		if (!add_physmap_entry(smap->base, smap->length, physmap,
		    physmap_idx))
			break;
	}
}

static void
add_efi_map_entries(struct efi_map_header *efihdr, vm_paddr_t *physmap,
    int *physmap_idx)
{
	struct efi_md *map, *p;
	const char *type;
	size_t efisz;
	int ndesc, i;

	static const char *types[] = {
		"Reserved",
		"LoaderCode",
		"LoaderData",
		"BootServicesCode",
		"BootServicesData",
		"RuntimeServicesCode",
		"RuntimeServicesData",
		"ConventionalMemory",
		"UnusableMemory",
		"ACPIReclaimMemory",
		"ACPIMemoryNVS",
		"MemoryMappedIO",
		"MemoryMappedIOPortSpace",
		"PalCode",
		"PersistentMemory"
	};

	/*
	 * Memory map data provided by UEFI via the GetMemoryMap
	 * Boot Services API.
	 */
	efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf;
	map = (struct efi_md *)((uint8_t *)efihdr + efisz);

	if (efihdr->descriptor_size == 0)
		return;
	ndesc = efihdr->memory_size / efihdr->descriptor_size;

	if (boothowto & RB_VERBOSE)
		printf("%23s %12s %12s %8s %4s\n",
		    "Type", "Physical", "Virtual", "#Pages", "Attr");

	for (i = 0, p = map; i < ndesc; i++,
	    p = efi_next_descriptor(p, efihdr->descriptor_size)) {
		if (boothowto & RB_VERBOSE) {
			if (p->md_type < nitems(types))
				type = types[p->md_type];
			else
				type = "<INVALID>";
			printf("%23s %012lx %012lx %08lx ", type, p->md_phys,
			    p->md_virt, p->md_pages);
			if (p->md_attr & EFI_MD_ATTR_UC)
				printf("UC ");
			if (p->md_attr & EFI_MD_ATTR_WC)
				printf("WC ");
			if (p->md_attr & EFI_MD_ATTR_WT)
				printf("WT ");
			if (p->md_attr & EFI_MD_ATTR_WB)
				printf("WB ");
			if (p->md_attr & EFI_MD_ATTR_UCE)
				printf("UCE ");
			if (p->md_attr & EFI_MD_ATTR_WP)
				printf("WP ");
			if (p->md_attr & EFI_MD_ATTR_RP)
				printf("RP ");
			if (p->md_attr & EFI_MD_ATTR_XP)
				printf("XP ");
			if (p->md_attr & EFI_MD_ATTR_NV)
				printf("NV ");
			if (p->md_attr & EFI_MD_ATTR_MORE_RELIABLE)
				printf("MORE_RELIABLE ");
			if (p->md_attr & EFI_MD_ATTR_RO)
				printf("RO ");
			if (p->md_attr & EFI_MD_ATTR_RT)
				printf("RUNTIME");
			printf("\n");
		}

		switch (p->md_type) {
		case EFI_MD_TYPE_CODE:
		case EFI_MD_TYPE_DATA:
		case EFI_MD_TYPE_BS_CODE:
		case EFI_MD_TYPE_BS_DATA:
		case EFI_MD_TYPE_FREE:
			/*
			 * We're allowed to use any entry with these types.
			 */
			break;
		default:
			continue;
		}

		if (!add_physmap_entry(p->md_phys, p->md_pages * EFI_PAGE_SIZE,
		    physmap, physmap_idx))
			break;
	}
}

static void
native_parse_memmap(vm_paddr_t *physmap, int *physmap_idx)
{
	struct bios_smap *smap;
	struct efi_map_header *efihdr;
	u_int32_t size;

	/*
	 * Memory map from INT 15:E820.
	 *
	 * subr_module.c says:
	 * "Consumer may safely assume that size value precedes data."
	 * i.e., an int32_t immediately precedes smap.
	 */

	efihdr = (struct efi_map_header *)preload_search_info(preload_kmdp,
	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
	smap = (struct bios_smap *)preload_search_info(preload_kmdp,
	    MODINFO_METADATA | MODINFOMD_SMAP);
	if (efihdr == NULL && smap == NULL)
		panic("No BIOS smap or EFI map info from loader!");

	if (efihdr != NULL) {
		add_efi_map_entries(efihdr, physmap, physmap_idx);
		strlcpy(bootmethod, "UEFI", sizeof(bootmethod));
	} else {
		size = *((u_int32_t *)smap - 1);
		bios_add_smap_entries(smap, size, physmap, physmap_idx);
		strlcpy(bootmethod, "BIOS", sizeof(bootmethod));
	}
}

#define	PAGES_PER_GB	(1024 * 1024 * 1024 / PAGE_SIZE)

/*
 * Populate the (physmap) array with base/bound pairs describing the
 * available physical memory in the system, then test this memory and
 * build the phys_avail array describing the actually-available memory.
 *
 * Total memory size may be set by the kernel environment variable
 * hw.physmem or the compile-time define MAXMEM.
 *
 * XXX first should be vm_paddr_t.
 */
static void
getmemsize(u_int64_t first)
{
	int i, physmap_idx, pa_indx, da_indx;
	vm_paddr_t pa, physmap[PHYS_AVAIL_ENTRIES];
	u_long physmem_start, physmem_tunable, memtest;
	pt_entry_t *pte;
	quad_t dcons_addr, dcons_size;
	int page_counter;

	TSENTER();
	/*
	 * Tell the physical memory allocator about pages used to store
	 * the kernel and preloaded data.  See kmem_bootstrap_free().
	 */
	vm_phys_early_add_seg((vm_paddr_t)kernphys, trunc_page(first));

	bzero(physmap, sizeof(physmap));
	physmap_idx = 0;

	init_ops.parse_memmap(physmap, &physmap_idx);
	physmap_idx -= 2;

	/*
	 * Find the 'base memory' segment for SMP
	 */
	basemem = 0;
	for (i = 0; i <= physmap_idx; i += 2) {
		if (physmap[i] <= 0xA0000) {
			basemem = physmap[i + 1] / 1024;
			break;
		}
	}
	if (basemem == 0 || basemem > 640) {
		if (bootverbose)
			printf(
		"Memory map doesn't contain a basemem segment, faking it\n");
		basemem = 640;
	}

	/*
	 * Maxmem isn't the "maximum memory", it's one larger than the
	 * highest page of the physical address space.  It should be
	 * called something like "Maxphyspage".  We may adjust this
	 * based on ``hw.physmem'' and the results of the memory test.
	 */
	Maxmem = atop(physmap[physmap_idx + 1]);

#ifdef MAXMEM
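	/*
	 * The MAXMEM kernel option is given in kilobytes, so dividing by
	 * 4 converts it to the 4KB page count that Maxmem expects.
	 */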
	Maxmem = MAXMEM / 4;
#endif

	if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
		Maxmem = atop(physmem_tunable);

	/*
	 * The boot memory test is disabled by default, as it takes a
	 * significant amount of time on large-memory systems, and is
	 * unfriendly to virtual machines as it unnecessarily touches all
	 * pages.
	 *
	 * A general name is used as the code may be extended to support
	 * additional tests beyond the current "page present" test.
	 */
	memtest = 0;
	TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest);

	/*
	 * Don't allow MAXMEM or hw.physmem to extend the amount of memory
	 * in the system.
	 */
	if (Maxmem > atop(physmap[physmap_idx + 1]))
		Maxmem = atop(physmap[physmap_idx + 1]);

	if (atop(physmap[physmap_idx + 1]) != Maxmem &&
	    (boothowto & RB_VERBOSE))
		printf("Physical memory use set to %ldK\n", Maxmem * 4);

	/* call pmap initialization to make new kernel address space */
	pmap_bootstrap(&first);

	/*
	 * Size up each available chunk of physical memory.
	 *
	 * XXX Some BIOSes corrupt the low 64KB between suspend and resume.
	 * By default, mask off the first 16 pages unless we appear to be
	 * running in a VM.
	 */
	physmem_start = (vm_guest > VM_GUEST_NO ? 1 : 16) << PAGE_SHIFT;
	TUNABLE_ULONG_FETCH("hw.physmem.start", &physmem_start);
	if (physmap[0] < physmem_start) {
		if (physmem_start < PAGE_SIZE)
			physmap[0] = PAGE_SIZE;
		else if (physmem_start >= physmap[1])
			physmap[0] = round_page(physmap[1] - PAGE_SIZE);
		else
			physmap[0] = round_page(physmem_start);
	}
	pa_indx = 0;
	da_indx = 1;
	phys_avail[pa_indx++] = physmap[0];
	phys_avail[pa_indx] = physmap[0];
	dump_avail[da_indx] = physmap[0];
	pte = CMAP1;

	/*
	 * Get dcons buffer address
	 */
	if (getenv_quad("dcons.addr", &dcons_addr) == 0 ||
	    getenv_quad("dcons.size", &dcons_size) == 0)
		dcons_addr = 0;

	/*
	 * physmap is in bytes, so when converting to page boundaries,
	 * round up the start address and round down the end address.
	 */
	page_counter = 0;
	if (memtest != 0)
		printf("Testing system memory");
	for (i = 0; i <= physmap_idx; i += 2) {
		vm_paddr_t end;

		end = ptoa((vm_paddr_t)Maxmem);
		if (physmap[i + 1] < end)
			end = trunc_page(physmap[i + 1]);
		for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
			int *ptr = (int *)CADDR1;
			int tmp;
			bool full, page_bad;

			full = false;
			/*
			 * block out kernel memory as not available.
			 */
			if (pa >= (vm_paddr_t)kernphys && pa < first)
				goto do_dump_avail;

			/*
			 * block out dcons buffer
			 */
			if (dcons_addr > 0
			    && pa >= trunc_page(dcons_addr)
			    && pa < dcons_addr + dcons_size)
				goto do_dump_avail;

			page_bad = false;
			if (memtest == 0)
				goto skip_memtest;

			/*
			 * Print a "." every GB to show we're making
			 * progress.
			 */
			page_counter++;
			if ((page_counter % PAGES_PER_GB) == 0)
				printf(".");

			/*
			 * map page into kernel: valid, read/write,
			 * non-cacheable
			 */
			*pte = pa | PG_V | PG_RW | PG_NC_PWT | PG_NC_PCD;
			invltlb();

			tmp = *(int *)ptr;
			/*
			 * Test for alternating 1's and 0's
			 */
			*(volatile int *)ptr = 0xaaaaaaaa;
			if (*(volatile int *)ptr != 0xaaaaaaaa)
				page_bad = true;
			/*
			 * Test for alternating 0's and 1's
			 */
			*(volatile int *)ptr = 0x55555555;
			if (*(volatile int *)ptr != 0x55555555)
				page_bad = true;
			/*
			 * Test for all 1's
			 */
			*(volatile int *)ptr = 0xffffffff;
			if (*(volatile int *)ptr != 0xffffffff)
				page_bad = true;
			/*
			 * Test for all 0's
			 */
			*(volatile int *)ptr = 0x0;
			if (*(volatile int *)ptr != 0x0)
				page_bad = true;
			/*
			 * Restore original value.
			 */
			*(int *)ptr = tmp;

skip_memtest:
			/*
			 * Adjust array of valid/good pages.
			 */
			if (page_bad)
				continue;
			/*
			 * If this good page is a continuation of the
			 * previous set of good pages, then just increase
			 * the end pointer.  Otherwise start a new chunk.
			 * Note that "end" points one past the last valid
			 * byte, making the range >= start and < end.
			 * If we're also doing a speculative memory
			 * test and we are at or past the end, bump up
			 * Maxmem so that we keep going.  The first bad
			 * page will terminate the loop.
			 */
			if (phys_avail[pa_indx] == pa) {
				phys_avail[pa_indx] += PAGE_SIZE;
			} else {
				pa_indx++;
				if (pa_indx == PHYS_AVAIL_ENTRIES) {
					printf(
		"Too many holes in the physical address space, giving up\n");
					pa_indx--;
					full = true;
					goto do_dump_avail;
				}
				phys_avail[pa_indx++] = pa;	/* start */
				phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */
			}
			physmem++;
do_dump_avail:
			if (dump_avail[da_indx] == pa) {
				dump_avail[da_indx] += PAGE_SIZE;
			} else {
				da_indx++;
				if (da_indx == PHYS_AVAIL_ENTRIES) {
					da_indx--;
					goto do_next;
				}
				dump_avail[da_indx++] = pa; /* start */
				dump_avail[da_indx] = pa + PAGE_SIZE; /* end */
			}
do_next:
			if (full)
				break;
		}
	}
	*pte = 0;
	invltlb();
	if (memtest != 0)
		printf("\n");

	/*
	 * XXX
	 * The last chunk must contain at least one page plus the message
	 * buffer to avoid complicating other code (message buffer address
	 * calculation, etc.).
	 */
	while (phys_avail[pa_indx - 1] + PAGE_SIZE +
	    round_page(msgbufsize) >= phys_avail[pa_indx]) {
		physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
		phys_avail[pa_indx--] = 0;
		phys_avail[pa_indx--] = 0;
	}

	Maxmem = atop(phys_avail[pa_indx]);

	/* Trim off space for the message buffer. */
	phys_avail[pa_indx] -= round_page(msgbufsize);

	/* Map the message buffer. */
	msgbufp = (struct msgbuf *)PHYS_TO_DMAP(phys_avail[pa_indx]);
	TSEXIT();
}

static void
native_parse_preload_data(u_int64_t modulep)
{
	char *envp;
#ifdef DDB
	vm_offset_t ksym_start;
	vm_offset_t ksym_end;
#endif

	preload_metadata = (caddr_t)(uintptr_t)(modulep + KERNBASE);
	preload_bootstrap_relocate(KERNBASE);
	preload_initkmdp(true);
	boothowto = MD_FETCH(preload_kmdp, MODINFOMD_HOWTO, int);
	envp = MD_FETCH(preload_kmdp, MODINFOMD_ENVP, char *);
	if (envp != NULL)
		envp += KERNBASE;
	init_static_kenv(envp, 0);
#ifdef DDB
	ksym_start = MD_FETCH(preload_kmdp, MODINFOMD_SSYM, uintptr_t);
	ksym_end = MD_FETCH(preload_kmdp, MODINFOMD_ESYM, uintptr_t);
	db_fetch_ksymtab(ksym_start, ksym_end, 0);
#endif
	efi_systbl_phys = MD_FETCH(preload_kmdp, MODINFOMD_FW_HANDLE,
	    vm_paddr_t);
}

static void
native_clock_source_init(void)
{
	i8254_init();
}

static void
amd64_kdb_init(void)
{
	kdb_init();
#ifdef KDB
	if (boothowto & RB_KDB)
		kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger");
#endif
}

/* Set up the fast syscall stuff */
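/*
 * EFER.SCE enables SYSCALL/SYSRET; MSR_LSTAR and MSR_CSTAR hold the
 * 64-bit and compat-mode entry points; MSR_STAR packs the kernel CS/SS
 * selector base into bits 47:32 and the SYSRET selector base into bits
 * 63:48; and MSR_SF_MASK lists the RFLAGS bits the CPU clears on entry,
 * notably IF (interrupts) and AC (alignment checks).
 */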
void
amd64_conf_fast_syscall(void)
{
	uint64_t msr;

	msr = rdmsr(MSR_EFER) | EFER_SCE;
	wrmsr(MSR_EFER, msr);
	wrmsr(MSR_LSTAR, pti ? (u_int64_t)IDTVEC(fast_syscall_pti) :
	    (u_int64_t)IDTVEC(fast_syscall));
	wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
	msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
	    ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
	wrmsr(MSR_STAR, msr);
	wrmsr(MSR_SF_MASK, PSL_NT | PSL_T | PSL_I | PSL_C | PSL_D | PSL_AC);
}

void
amd64_bsp_pcpu_init1(struct pcpu *pc)
{
	struct user_segment_descriptor *gdt;

	PCPU_SET(prvspace, pc);
	gdt = *PCPU_PTR(gdt);
	PCPU_SET(curthread, &thread0);
	PCPU_SET(tssp, PCPU_PTR(common_tss));
	PCPU_SET(tss, (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
	PCPU_SET(ldt, (struct system_segment_descriptor *)&gdt[GUSERLDT_SEL]);
	PCPU_SET(fs32p, &gdt[GUFS32_SEL]);
	PCPU_SET(gs32p, &gdt[GUGS32_SEL]);
	PCPU_SET(ucr3_load_mask, PMAP_UCR3_NOMASK);
	PCPU_SET(smp_tlb_gen, 1);
}

void
amd64_bsp_pcpu_init2(uint64_t rsp0)
{

	PCPU_SET(rsp0, rsp0);
	PCPU_SET(pti_rsp0, ((vm_offset_t)PCPU_PTR(pti_stack) +
	    PC_PTI_STACK_SZ * sizeof(uint64_t)) & ~0xful);
	PCPU_SET(curpcb, thread0.td_pcb);
}

void
amd64_bsp_ist_init(struct pcpu *pc)
{
	struct nmi_pcpu *np;
	struct amd64tss *tssp;

	tssp = &pc->pc_common_tss;

	/* doublefault stack space, runs on ist1 */
	np = ((struct nmi_pcpu *)&dblfault_stack[sizeof(dblfault_stack)]) - 1;
	np->np_pcpu = (register_t)pc;
	tssp->tss_ist1 = (long)np;

	/*
	 * NMI stack, runs on ist2.  The pcpu pointer is stored just
	 * above the start of the ist2 stack.
	 */
	np = ((struct nmi_pcpu *)&nmi0_stack[sizeof(nmi0_stack)]) - 1;
	np->np_pcpu = (register_t)pc;
	tssp->tss_ist2 = (long)np;

	/*
	 * MC# stack, runs on ist3.  The pcpu pointer is stored just
	 * above the start of the ist3 stack.
	 */
	np = ((struct nmi_pcpu *)&mce0_stack[sizeof(mce0_stack)]) - 1;
	np->np_pcpu = (register_t)pc;
	tssp->tss_ist3 = (long)np;

	/*
	 * DB# stack, runs on ist4.
	 */
	np = ((struct nmi_pcpu *)&dbg0_stack[sizeof(dbg0_stack)]) - 1;
	np->np_pcpu = (register_t)pc;
	tssp->tss_ist4 = (long)np;
}

/*
 * Calculate the kernel load address by inspecting the page table created
 * by the loader.  The assumptions:
 * - kernel is mapped at KERNBASE, backed by contiguous phys memory
 *   aligned at 2M, below 4G (the latter is important for AP startup)
 * - there is a 2M hole at KERNBASE (KERNSTART = KERNBASE + 2M)
 * - kernel is mapped with 2M superpages
 * - all participating memory, i.e. kernel, modules, metadata,
 *   page table is accessible by pre-created 1:1 mapping
 *   (right now loader creates 1:1 mapping for lower 4G, and all
 *   memory is from there)
 * - there is a usable memory block right after the end of the
 *   mapped kernel and all modules/metadata, pointed to by
 *   physfree, for early allocations
 */
vm_paddr_t __nosanitizeaddress __nosanitizememory
amd64_loadaddr(void)
{
	pml4_entry_t *pml4e;
	pdp_entry_t *pdpe;
	pd_entry_t *pde;
	uint64_t cr3;

	cr3 = rcr3();
	pml4e = (pml4_entry_t *)cr3 + pmap_pml4e_index(KERNSTART);
	pdpe = (pdp_entry_t *)(*pml4e & PG_FRAME) + pmap_pdpe_index(KERNSTART);
	pde = (pd_entry_t *)(*pdpe & PG_FRAME) + pmap_pde_index(KERNSTART);
	return (*pde & PG_FRAME);
}

u_int64_t
hammer_time(u_int64_t modulep, u_int64_t physfree)
{
	int gsel_tss, x;
	struct pcpu *pc;
	uint64_t rsp0;
	char *env;
	struct user_segment_descriptor *gdt;
	struct region_descriptor r_gdt;
	size_t kstack0_sz;

	TSRAW(&thread0, TS_ENTER, __func__, NULL);

	kernphys = amd64_loadaddr();

	physfree += kernphys;

	/* Initializes preload_kmdp */
	init_ops.parse_preload_data(modulep);

	efi_boot = preload_search_info(preload_kmdp, MODINFO_METADATA |
	    MODINFOMD_EFI_MAP) != NULL;

	if (!efi_boot) {
		/* Tell the BIOS to warm-boot next time */
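		/*
		 * 0x472 is the reset-flag word in the BIOS data area;
		 * 0x1234 requests a warm boot that skips the POST memory
		 * test on the next reset.
		 */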
		atomic_store_short((u_short *)0x472, 0x1234);
	}

	physfree += ucode_load_bsp(physfree - kernphys + KERNSTART);
	physfree = roundup2(physfree, PAGE_SIZE);

	identify_cpu1();
	identify_hypervisor();
	identify_hypervisor_smbios();
	identify_cpu_fixup_bsp();
	identify_cpu2();
	initializecpucache();

	/*
	 * Check for pti, pcid, and invpcid before ifuncs are
	 * resolved, to correctly select the implementation for
	 * pmap_activate_sw_mode().
	 */
	pti = pti_get_default();
	TUNABLE_INT_FETCH("vm.pmap.pti", &pti);
	TUNABLE_INT_FETCH("vm.pmap.pcid_enabled", &pmap_pcid_enabled);
	if ((cpu_feature2 & CPUID2_PCID) != 0 && pmap_pcid_enabled) {
		invpcid_works = (cpu_stdext_feature &
		    CPUID_STDEXT_INVPCID) != 0;
	} else {
		pmap_pcid_enabled = 0;
	}

	/*
	 * Now we can do small core initialization, after the PCID
	 * CPU features and user knobs are evaluated.
	 */
	TUNABLE_INT_FETCH("vm.pmap.pcid_invlpg_workaround",
	    &pmap_pcid_invlpg_workaround_uena);
	cpu_init_small_core();

	if ((cpu_feature2 & CPUID2_XSAVE) != 0) {
		use_xsave = 1;
		TUNABLE_INT_FETCH("hw.use_xsave", &use_xsave);
	}

	link_elf_ireloc();

	/*
	 * This may be done better later if it gets more high level
	 * components in it. If so just link td->td_proc here.
	 */
	proc_linkup0(&proc0, &thread0);

	/* Init basic tunables, hz etc */
	init_param1();

	thread0.td_kstack = physfree - kernphys + KERNSTART;
	thread0.td_kstack_pages = kstack_pages;
	kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE;
	bzero((void *)thread0.td_kstack, kstack0_sz);
	physfree += kstack0_sz;

	/*
	 * Initialize enough of thread0 for delayed invalidation to
	 * work very early.  Rely on thread0.td_base_pri
	 * zero-initialization, it is reset to PVM at proc0_init().
	 */
	pmap_thread_init_invl_gen(&thread0);

	pc = &temp_bsp_pcpu;
	pcpu_init(pc, 0, sizeof(struct pcpu));
	gdt = &temp_bsp_pcpu.pc_gdt[0];

	/*
	 * make gdt memory segments
	 */
	for (x = 0; x < NGDT; x++) {
		if (x != GPROC0_SEL && x != (GPROC0_SEL + 1) &&
		    x != GUSERLDT_SEL && x != (GUSERLDT_SEL + 1))
			ssdtosd(&gdt_segs[x], &gdt[x]);
	}
	gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&pc->pc_common_tss;
	ssdtosyssd(&gdt_segs[GPROC0_SEL],
	    (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);

	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
	r_gdt.rd_base = (long)gdt;
	lgdt(&r_gdt);

	wrmsr(MSR_FSBASE, 0);		/* User value */
	wrmsr(MSR_GSBASE, (u_int64_t)pc);
	wrmsr(MSR_KGSBASE, 0);		/* User value while in the kernel */

	dpcpu_init((void *)(physfree - kernphys + KERNSTART), 0);
	physfree += DPCPU_SIZE;
	amd64_bsp_pcpu_init1(pc);
	/* Non-late cninit() and printf() can be moved up to here. */

	/*
	 * Initialize mutexes.
	 *
	 * icu_lock: in order to allow an interrupt to occur in a critical
	 *	     section, to set pcpu->ipending (etc...) properly, we
	 *	     must be able to get the icu lock, so it can't be
	 *	     under witness.
	 */
	mutex_init();
	mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS);
	mtx_init(&dt_lock, "descriptor tables", NULL, MTX_DEF);

	/* exceptions */
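	/*
	 * Faults that must not run on a possibly bad kernel stack get
	 * dedicated IST stacks, matching amd64_bsp_ist_init(): #DF on
	 * IST1, NMI on IST2, #MC on IST3 and #DB on IST4.
	 */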
	for (x = 0; x < NIDT; x++)
		setidt(x, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_SYSIGT,
		    SEL_KPL, 0);
	setidt(IDT_DE, pti ? &IDTVEC(div_pti) : &IDTVEC(div), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 4);
	setidt(IDT_NMI, &IDTVEC(nmi), SDT_SYSIGT, SEL_KPL, 2);
	setidt(IDT_BP, pti ? &IDTVEC(bpt_pti) : &IDTVEC(bpt), SDT_SYSIGT,
	    SEL_UPL, 0);
	setidt(IDT_OF, pti ? &IDTVEC(ofl_pti) : &IDTVEC(ofl), SDT_SYSIGT,
	    SEL_UPL, 0);
	setidt(IDT_BR, pti ? &IDTVEC(bnd_pti) : &IDTVEC(bnd), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_UD, pti ? &IDTVEC(ill_pti) : &IDTVEC(ill), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_NM, pti ? &IDTVEC(dna_pti) : &IDTVEC(dna), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
	setidt(IDT_FPUGP, pti ? &IDTVEC(fpusegm_pti) : &IDTVEC(fpusegm),
	    SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_TS, pti ? &IDTVEC(tss_pti) : &IDTVEC(tss), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_NP, pti ? &IDTVEC(missing_pti) : &IDTVEC(missing),
	    SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_SS, pti ? &IDTVEC(stk_pti) : &IDTVEC(stk), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_GP, pti ? &IDTVEC(prot_pti) : &IDTVEC(prot), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_PF, pti ? &IDTVEC(page_pti) : &IDTVEC(page), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_MF, pti ? &IDTVEC(fpu_pti) : &IDTVEC(fpu), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_AC, pti ? &IDTVEC(align_pti) : &IDTVEC(align), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 3);
	setidt(IDT_XF, pti ? &IDTVEC(xmm_pti) : &IDTVEC(xmm), SDT_SYSIGT,
	    SEL_KPL, 0);
#ifdef KDTRACE_HOOKS
	setidt(IDT_DTRACE_RET, pti ? &IDTVEC(dtrace_ret_pti) :
	    &IDTVEC(dtrace_ret), SDT_SYSIGT, SEL_UPL, 0);
#endif
#ifdef XENHVM
	setidt(IDT_EVTCHN, pti ? &IDTVEC(xen_intr_upcall_pti) :
	    &IDTVEC(xen_intr_upcall), SDT_SYSIGT, SEL_KPL, 0);
#endif
	r_idt.rd_limit = sizeof(idt0) - 1;
	r_idt.rd_base = (long)idt;
	lidt(&r_idt);

	TUNABLE_INT_FETCH("hw.ibrs_disable", &hw_ibrs_disable);
	TUNABLE_INT_FETCH("machdep.mitigations.ibrs.disable", &hw_ibrs_disable);

	TUNABLE_INT_FETCH("hw.spec_store_bypass_disable", &hw_ssb_disable);
	TUNABLE_INT_FETCH("machdep.mitigations.ssb.disable", &hw_ssb_disable);

	TUNABLE_INT_FETCH("machdep.syscall_ret_flush_l1d",
	    &syscall_ret_l1d_flush_mode);

	TUNABLE_INT_FETCH("hw.mds_disable", &hw_mds_disable);
	TUNABLE_INT_FETCH("machdep.mitigations.mds.disable", &hw_mds_disable);

	TUNABLE_INT_FETCH("machdep.mitigations.taa.enable", &x86_taa_enable);

	TUNABLE_INT_FETCH("machdep.mitigations.rngds.enable",
	    &x86_rngds_mitg_enable);

	TUNABLE_INT_FETCH("machdep.mitigations.zenbleed.enable",
	    &zenbleed_enable);
	zenbleed_sanitize_enable();

	finishidentcpu();	/* Final stage of CPU initialization */

	invlpgb_works = (amd_extended_feature_extensions &
	    AMDFEID_INVLPGB) != 0;
	TUNABLE_INT_FETCH("vm.pmap.invlpgb_works", &invlpgb_works);
	if (invlpgb_works)
		invlpgb_maxcnt = cpu_procinfo3 & AMDID_INVLPGB_MAXCNT;

	/*
	 * Initialize the clock before the console so that console
	 * initialization can use DELAY().
	 */
	clock_init();

	initializecpu();	/* Initialize CPU registers */

	amd64_bsp_ist_init(pc);

	/* Set the I/O permission bitmap (empty due to the TSS segment limit) */
	pc->pc_common_tss.tss_iobase = sizeof(struct amd64tss) +
	    IOPERM_BITMAP_SIZE;

	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
	ltr(gsel_tss);

	amd64_conf_fast_syscall();

	/*
	 * We initialize the PCB pointer early so that exception
	 * handlers will work.  Also set up td_critnest to short-cut
	 * the page fault handler.
	 */
	cpu_max_ext_state_size = sizeof(struct savefpu);
	set_top_of_stack_td(&thread0);
	thread0.td_pcb = get_pcb_td(&thread0);
	thread0.td_critnest = 1;

	/*
	 * The console and kdb should be initialized even earlier than here,
	 * but some console drivers don't work until after getmemsize().
	 * Default to late console initialization to support these drivers.
	 * This loses mainly printf()s in getmemsize() and early debugging.
	 */
	TUNABLE_INT_FETCH("debug.late_console", &late_console);
	if (!late_console) {
		cninit();
		amd64_kdb_init();
	}

	getmemsize(physfree);
	init_param2(physmem);

	/* now running on new page tables, configured, and u/iom is accessible */

#ifdef DEV_PCI
	/* This call might adjust phys_avail[]. */
	pci_early_quirks();
#endif

	if (late_console)
		cninit();

	/*
	 * Dump the boot metadata. We have to wait for cninit() since console
	 * output is required. If it's grossly incorrect the kernel will never
	 * make it this far.
	 */
	if (getenv_is_true("debug.dump_modinfo_at_boot"))
		preload_dump();

#ifdef DEV_ISA
#ifdef DEV_ATPIC
	elcr_probe();
	atpic_startup();
#else
	/* Reset and mask the atpics and leave them shut down. */
	atpic_reset();

	/*
	 * Point the ICU spurious interrupt vectors at the APIC spurious
	 * interrupt handler.
	 */
	setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
#endif
#else
#error "have you forgotten the isa device?"
#endif

	if (late_console)
		amd64_kdb_init();

	msgbufinit(msgbufp, msgbufsize);
	fpuinit();

	/* make an initial tss so cpu can get interrupt stack on syscall! */
	rsp0 = thread0.td_md.md_stack_base;
	/* Ensure the stack is aligned to 16 bytes */
	rsp0 &= ~0xFul;
	PCPU_PTR(common_tss)->tss_rsp0 = rsp0;
	amd64_bsp_pcpu_init2(rsp0);

	/* transfer to user mode */

	_ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
	_udatasel = GSEL(GUDATA_SEL, SEL_UPL);
	_ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL);
	_ufssel = GSEL(GUFS32_SEL, SEL_UPL);
	_ugssel = GSEL(GUGS32_SEL, SEL_UPL);

	load_ds(_udatasel);
	load_es(_udatasel);
	load_fs(_ufssel);

	/* setup proc 0's pcb */
	thread0.td_pcb->pcb_flags = 0;

	env = kern_getenv("kernelname");
	if (env != NULL)
		strlcpy(kernelname, env, sizeof(kernelname));

	kcsan_cpu_init(0);

#ifdef FDT
	x86_init_fdt();
#endif
	thread0.td_critnest = 0;

	kasan_init();
	kmsan_init();

	TSEXIT();

	/* Location of kernel stack for locore */
	return (thread0.td_md.md_stack_base);
}

void
cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
{

	pcpu->pc_acpi_id = 0xffffffff;
}

static int
smap_sysctl_handler(SYSCTL_HANDLER_ARGS)
{
	struct bios_smap *smapbase;
	struct bios_smap_xattr smap;
	uint32_t *smapattr;
	int count, error, i;

	/* Retrieve the system memory map from the loader. */
	smapbase = (struct bios_smap *)preload_search_info(preload_kmdp,
	    MODINFO_METADATA | MODINFOMD_SMAP);
	if (smapbase == NULL)
		return (0);
	smapattr = (uint32_t *)preload_search_info(preload_kmdp,
	    MODINFO_METADATA | MODINFOMD_SMAP_XATTR);
	count = *((uint32_t *)smapbase - 1) / sizeof(*smapbase);
	error = 0;
	for (i = 0; i < count; i++) {
		smap.base = smapbase[i].base;
		smap.length = smapbase[i].length;
		smap.type = smapbase[i].type;
		if (smapattr != NULL)
			smap.xattr = smapattr[i];
		else
			smap.xattr = 0;
		error = SYSCTL_OUT(req, &smap, sizeof(smap));
	}
	return (error);
}
SYSCTL_PROC(_machdep, OID_AUTO, smap,
    CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    smap_sysctl_handler, "S,bios_smap_xattr",
    "Raw BIOS SMAP data");

static int
efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS)
{
	struct efi_map_header *efihdr;
	uint32_t efisize;

	efihdr = (struct efi_map_header *)preload_search_info(preload_kmdp,
	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
	if (efihdr == NULL)
		return (0);
	efisize = *((uint32_t *)efihdr - 1);
	return (SYSCTL_OUT(req, efihdr, efisize));
}
SYSCTL_PROC(_machdep, OID_AUTO, efi_map,
    CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    efi_map_sysctl_handler, "S,efi_map_header",
    "Raw EFI Memory Map");

static int
efi_arch_sysctl_handler(SYSCTL_HANDLER_ARGS)
{
	char *arch;

	arch = (char *)preload_search_info(preload_kmdp,
	    MODINFO_METADATA | MODINFOMD_EFI_ARCH);
	if (arch == NULL)
		return (0);

	return (SYSCTL_OUT_STR(req, arch));
}
SYSCTL_PROC(_machdep, OID_AUTO, efi_arch,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    efi_arch_sysctl_handler, "A", "EFI Firmware Architecture");

void
spinlock_enter(void)
{
	struct thread *td;
	register_t flags;

	td = curthread;
	if (td->td_md.md_spinlock_count == 0) {
		flags = intr_disable();
		td->td_md.md_spinlock_count = 1;
		td->td_md.md_saved_flags = flags;
		critical_enter();
	} else
		td->td_md.md_spinlock_count++;
}

void
spinlock_exit(void)
{
	struct thread *td;
	register_t flags;

	td = curthread;
	flags = td->td_md.md_saved_flags;
	td->td_md.md_spinlock_count--;
	if (td->td_md.md_spinlock_count == 0) {
		critical_exit();
		intr_restore(flags);
	}
}

/*
 * Construct a PCB from a trapframe.  This is called from kdb_trap() where
 * we want to start a backtrace from the function that caused us to enter
 * the debugger.  We have the context in the trapframe, but base the trace
 * on the PCB.  The PCB doesn't have to be perfect, as long as it contains
 * enough for a backtrace.
 */
void
makectx(struct trapframe *tf, struct pcb *pcb)
{

	pcb->pcb_r12 = tf->tf_r12;
	pcb->pcb_r13 = tf->tf_r13;
	pcb->pcb_r14 = tf->tf_r14;
	pcb->pcb_r15 = tf->tf_r15;
	pcb->pcb_rbp = tf->tf_rbp;
	pcb->pcb_rbx = tf->tf_rbx;
	pcb->pcb_rip = tf->tf_rip;
	pcb->pcb_rsp = tf->tf_rsp;
}

/*
 * pcb_flags is modified only by the current thread, or by other threads
 * when the current thread is stopped.  However, the current thread may
 * change it from interrupt context in cpu_switch(), or in the trap handler.
 * When we read-modify-write pcb_flags from C sources, the compiler may
 * generate code that is not atomic with respect to the interrupt handler.
 * If a trap or interrupt happens and any flag is modified from the
 * handler, the change can be clobbered by the cached value later.
 * Therefore, we implement setting and clearing flags with
 * single-instruction functions, which do not race with possible
 * modification of the flags from the trap or interrupt context, because
 * traps and interrupts are executed only on instruction boundaries.
 */
void
set_pcb_flags_raw(struct pcb *pcb, const u_int flags)
{

	__asm __volatile("orl %1,%0"
	    : "=m" (pcb->pcb_flags) : "ir" (flags), "m" (pcb->pcb_flags)
	    : "cc", "memory");
}

/*
 * The support for RDFSBASE, WRFSBASE and similar instructions for the %gs
 * base requires that the kernel save MSR_FSBASE and MSR_{K,}GSBASE into the
 * pcb if user space modified the bases.  We must save them on the context
 * switch or when the return to usermode happens through doreti.
 *
 * Tracking of both events is performed by the pcb flag PCB_FULL_IRET,
 * which has the consequence that the base MSRs must be saved each time
 * the PCB_FULL_IRET flag is set.  We disable interrupts to sync with
 * context switches.
 */
static void
set_pcb_flags_fsgsbase(struct pcb *pcb, const u_int flags)
{
	register_t r;

	if (curpcb == pcb &&
	    (flags & PCB_FULL_IRET) != 0 &&
	    (pcb->pcb_flags & PCB_FULL_IRET) == 0) {
		r = intr_disable();
		if ((pcb->pcb_flags & PCB_FULL_IRET) == 0) {
			if (rfs() == _ufssel)
				pcb->pcb_fsbase = rdfsbase();
			if (rgs() == _ugssel)
				pcb->pcb_gsbase = rdmsr(MSR_KGSBASE);
		}
		set_pcb_flags_raw(pcb, flags);
		intr_restore(r);
	} else {
		set_pcb_flags_raw(pcb, flags);
	}
}

DEFINE_IFUNC(, void, set_pcb_flags, (struct pcb *, const u_int))
{

	return ((cpu_stdext_feature & CPUID_STDEXT_FSGSBASE) != 0 ?
	    set_pcb_flags_fsgsbase : set_pcb_flags_raw);
}

void
clear_pcb_flags(struct pcb *pcb, const u_int flags)
{

	__asm __volatile("andl %1,%0"
	    : "=m" (pcb->pcb_flags) : "ir" (~flags), "m" (pcb->pcb_flags)
	    : "cc", "memory");
}

#ifdef KDB

/*
 * Provide inb() and outb() as functions.  They are normally only available as
 * inline functions, thus cannot be called from the debugger.
 */

/* silence compiler warnings */
u_char inb_(u_short);
void outb_(u_short, u_char);

u_char
inb_(u_short port)
{
	return inb(port);
}

void
outb_(u_short port, u_char data)
{
	outb(port, data);
}

#endif /* KDB */

#undef memset
#undef memmove
#undef memcpy

void	*memset_std(void *buf, int c, size_t len);
void	*memset_erms(void *buf, int c, size_t len);
void	*memmove_std(void * _Nonnull dst, const void * _Nonnull src,
	    size_t len);
void	*memmove_erms(void * _Nonnull dst, const void * _Nonnull src,
	    size_t len);
void	*memcpy_std(void * _Nonnull dst, const void * _Nonnull src,
	    size_t len);
void	*memcpy_erms(void * _Nonnull dst, const void * _Nonnull src,
	    size_t len);

#ifdef KCSAN
/*
 * These fail to build as ifuncs when used with KCSAN.
 */
void *
memset(void *buf, int c, size_t len)
{

	return (memset_std(buf, c, len));
}

void *
memmove(void * _Nonnull dst, const void * _Nonnull src, size_t len)
{

	return (memmove_std(dst, src, len));
}

void *
memcpy(void * _Nonnull dst, const void * _Nonnull src, size_t len)
{

	return (memcpy_std(dst, src, len));
}
#else
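/*
 * With ERMS ("enhanced REP MOVSB/STOSB") the microcoded rep-string
 * instructions are the fastest way to copy and fill, so these ifuncs
 * select the *_erms variants whenever CPUID advertises the feature.
 */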
DEFINE_IFUNC(, void *, memset, (void *, int, size_t))
{

	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
	    memset_erms : memset_std);
}

DEFINE_IFUNC(, void *, memmove, (void * _Nonnull, const void * _Nonnull,
    size_t))
{

	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
	    memmove_erms : memmove_std);
}

DEFINE_IFUNC(, void *, memcpy, (void * _Nonnull, const void * _Nonnull,
    size_t))
{

	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
	    memcpy_erms : memcpy_std);
}
#endif

void	pagezero_std(void *addr);
void	pagezero_erms(void *addr);
DEFINE_IFUNC(, void, pagezero, (void *))
{

	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
	    pagezero_erms : pagezero_std);
}