1 /*-
2 * SPDX-License-Identifier: BSD-4-Clause
3 *
4 * Copyright (c) 2003 Peter Wemm.
5 * Copyright (c) 1992 Terrence R. Lambert.
6 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
7 * All rights reserved.
8 *
9 * This code is derived from software contributed to Berkeley by
10 * William Jolitz.
11 *
12 * Redistribution and use in source and binary forms, with or without
13 * modification, are permitted provided that the following conditions
14 * are met:
15 * 1. Redistributions of source code must retain the above copyright
16 * notice, this list of conditions and the following disclaimer.
17 * 2. Redistributions in binary form must reproduce the above copyright
18 * notice, this list of conditions and the following disclaimer in the
19 * documentation and/or other materials provided with the distribution.
20 * 3. All advertising materials mentioning features or use of this software
21 * must display the following acknowledgement:
22 * This product includes software developed by the University of
23 * California, Berkeley and its contributors.
24 * 4. Neither the name of the University nor the names of its contributors
25 * may be used to endorse or promote products derived from this software
26 * without specific prior written permission.
27 *
28 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
29 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
32 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
33 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
34 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
35 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
37 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
38 * SUCH DAMAGE.
39 */
40
41 #include <sys/cdefs.h>
42 #include "opt_atpic.h"
43 #include "opt_cpu.h"
44 #include "opt_ddb.h"
45 #include "opt_inet.h"
46 #include "opt_isa.h"
47 #include "opt_kstack_pages.h"
48 #include "opt_maxmem.h"
49 #include "opt_pci.h"
50 #include "opt_platform.h"
51 #include "opt_sched.h"
52
53 #include <sys/param.h>
54 #include <sys/proc.h>
55 #include <sys/systm.h>
56 #include <sys/asan.h>
57 #include <sys/bio.h>
58 #include <sys/buf.h>
59 #include <sys/bus.h>
60 #include <sys/callout.h>
61 #include <sys/cons.h>
62 #include <sys/cpu.h>
63 #include <sys/csan.h>
64 #include <sys/efi.h>
65 #include <sys/eventhandler.h>
66 #include <sys/exec.h>
67 #include <sys/imgact.h>
68 #include <sys/kdb.h>
69 #include <sys/kernel.h>
70 #include <sys/ktr.h>
71 #include <sys/linker.h>
72 #include <sys/lock.h>
73 #include <sys/malloc.h>
74 #include <sys/memrange.h>
75 #include <sys/msan.h>
76 #include <sys/msgbuf.h>
77 #include <sys/mutex.h>
78 #include <sys/pcpu.h>
79 #include <sys/ptrace.h>
80 #include <sys/reboot.h>
81 #include <sys/reg.h>
82 #include <sys/rwlock.h>
83 #include <sys/sched.h>
84 #include <sys/signalvar.h>
85 #ifdef SMP
86 #include <sys/smp.h>
87 #endif
88 #include <sys/syscallsubr.h>
89 #include <sys/sysctl.h>
90 #include <sys/sysent.h>
91 #include <sys/sysproto.h>
92 #include <sys/ucontext.h>
93 #include <sys/vmmeter.h>
94
95 #include <vm/vm.h>
96 #include <vm/vm_param.h>
97 #include <vm/vm_extern.h>
98 #include <vm/vm_kern.h>
99 #include <vm/vm_page.h>
100 #include <vm/vm_map.h>
101 #include <vm/vm_object.h>
102 #include <vm/vm_pager.h>
103 #include <vm/vm_phys.h>
104 #include <vm/vm_dumpset.h>
105
106 #ifdef DDB
107 #ifndef KDB
108 #error KDB must be enabled in order for DDB to work!
109 #endif
110 #include <ddb/ddb.h>
111 #include <ddb/db_sym.h>
112 #endif
113
114 #include <net/netisr.h>
115
116 #include <dev/smbios/smbios.h>
117
118 #include <machine/clock.h>
119 #include <machine/cpu.h>
120 #include <machine/cputypes.h>
121 #include <machine/frame.h>
122 #include <machine/intr_machdep.h>
123 #include <x86/mca.h>
124 #include <machine/md_var.h>
125 #include <machine/metadata.h>
126 #include <machine/pc/bios.h>
127 #include <machine/pcb.h>
128 #include <machine/proc.h>
129 #include <machine/sigframe.h>
130 #include <machine/specialreg.h>
131 #include <machine/trap.h>
132 #include <machine/tss.h>
133 #include <x86/ucode.h>
134 #include <x86/ifunc.h>
135 #ifdef SMP
136 #include <machine/smp.h>
137 #endif
138 #ifdef FDT
139 #include <x86/fdt.h>
140 #endif
141
142 #ifdef DEV_ATPIC
143 #include <x86/isa/icu.h>
144 #else
145 #include <x86/apicvar.h>
146 #endif
147
148 #include <isa/isareg.h>
149 #include <isa/rtc.h>
150 #include <x86/init.h>
151
152 /* Sanity check for __curthread() */
153 CTASSERT(offsetof(struct pcpu, pc_curthread) == 0);
154
155 /*
156 * The PTI trampoline stack needs enough space for a hardware trapframe and a
157 * couple of scratch registers, as well as the trapframe left behind after an
158 * iret fault.
159 */
160 CTASSERT(PC_PTI_STACK_SZ * sizeof(register_t) >= 2 * sizeof(struct pti_frame) -
161 offsetof(struct pti_frame, pti_rip));
162
163 extern u_int64_t hammer_time(u_int64_t, u_int64_t);
164
165 static void cpu_startup(void *);
166 SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL);
167
168 /* Probe 8254 PIT and TSC. */
169 static void native_clock_source_init(void);
170
171 /* Preload data parse function */
172 static void native_parse_preload_data(u_int64_t);
173
174 /* Native function to fetch and parse the e820 map */
175 static void native_parse_memmap(vm_paddr_t *, int *);
176
177 /* Default init_ops implementation. */
178 struct init_ops init_ops = {
179 .parse_preload_data = native_parse_preload_data,
180 .early_clock_source_init = native_clock_source_init,
181 .early_delay = i8254_delay,
182 .parse_memmap = native_parse_memmap,
183 };
184
185 /*
186 * Physical address of the EFI System Table. Stashed from the metadata hints
187 * passed into the kernel and used by the EFI code to call runtime services.
188 */
189 vm_paddr_t efi_systbl_phys;
190
191 /* Intel ICH registers */
192 #define ICH_PMBASE 0x400
193 #define ICH_SMI_EN (ICH_PMBASE + 0x30)
194
195 int _udatasel, _ucodesel, _ucode32sel, _ufssel, _ugssel;
196
197 int cold = 1;
198
199 long Maxmem = 0;
200 long realmem = 0;
201 int late_console = 1;
202
203 struct kva_md_info kmi;
204
205 struct region_descriptor r_idt;
206
207 struct pcpu *__pcpu;
208 struct pcpu temp_bsp_pcpu;
209
210 struct mtx icu_lock;
211
212 struct mem_range_softc mem_range_softc;
213
214 struct mtx dt_lock; /* lock for GDT and LDT */
215
216 void (*vmm_suspend_p)(void);
217 void (*vmm_resume_p)(void);
218
219 bool efi_boot;
220
221 static void
222 cpu_startup(void *dummy)
223 {
224 uintmax_t memsize;
225 char *sysenv;
226
227 /*
228 * On MacBooks, we need to prevent the legacy USB circuitry from
229 * generating an SMI# because this can cause several problems,
230 * namely: incorrect CPU frequency detection and failure to
231 * start the APs.
232 * We do this by disabling a bit in the SMI_EN (SMI Control and
233 * Enable register) of the Intel ICH LPC Interface Bridge.
234 */
235 sysenv = kern_getenv("smbios.system.product");
236 if (sysenv != NULL) {
237 if (strncmp(sysenv, "MacBook1,1", 10) == 0 ||
238 strncmp(sysenv, "MacBook3,1", 10) == 0 ||
239 strncmp(sysenv, "MacBook4,1", 10) == 0 ||
240 strncmp(sysenv, "MacBookPro1,1", 13) == 0 ||
241 strncmp(sysenv, "MacBookPro1,2", 13) == 0 ||
242 strncmp(sysenv, "MacBookPro3,1", 13) == 0 ||
243 strncmp(sysenv, "MacBookPro4,1", 13) == 0 ||
244 strncmp(sysenv, "Macmini1,1", 10) == 0) {
245 if (bootverbose)
246 printf("Disabling LEGACY_USB_EN bit on "
247 "Intel ICH.\n");
248 outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8);
249 }
250 freeenv(sysenv);
251 }
252
253 /*
254 * Good {morning,afternoon,evening,night}.
255 */
256 startrtclock();
257 printcpuinfo();
258
259 /*
260 * Display physical memory if SMBIOS reports reasonable amount.
261 */
262 memsize = 0;
263 sysenv = kern_getenv("smbios.memory.enabled");
264 if (sysenv != NULL) {
265 memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10;
266 freeenv(sysenv);
267 }
268 if (memsize < ptoa((uintmax_t)vm_free_count()))
269 memsize = ptoa((uintmax_t)Maxmem);
270 printf("real memory = %ju (%ju MB)\n", memsize, memsize >> 20);
271 realmem = atop(memsize);
272
273 /*
274 * Display any holes after the first chunk of extended memory.
275 */
276 if (bootverbose) {
277 int indx;
278
279 printf("Physical memory chunk(s):\n");
280 for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
281 vm_paddr_t size;
282
283 size = phys_avail[indx + 1] - phys_avail[indx];
284 printf(
285 "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n",
286 (uintmax_t)phys_avail[indx],
287 (uintmax_t)phys_avail[indx + 1] - 1,
288 (uintmax_t)size, (uintmax_t)size / PAGE_SIZE);
289 }
290 }
291
292 vm_ksubmap_init(&kmi);
293
294 printf("avail memory = %ju (%ju MB)\n",
295 ptoa((uintmax_t)vm_free_count()),
296 ptoa((uintmax_t)vm_free_count()) / 1048576);
297 #ifdef DEV_PCI
298 if (bootverbose && intel_graphics_stolen_base != 0)
299 printf("intel stolen mem: base %#jx size %ju MB\n",
300 (uintmax_t)intel_graphics_stolen_base,
301 (uintmax_t)intel_graphics_stolen_size / 1024 / 1024);
302 #endif
303
304 /*
305 * Set up buffers, so they can be used to read disk labels.
306 */
307 bufinit();
308 vm_pager_bufferinit();
309
310 cpu_setregs();
311 }
312
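/*
 * Perform the late pass of ELF ifunc relocation, run at the end of the
 * SI_SUB_CPU initialization stage.
 */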
313 static void
314 late_ifunc_resolve(void *dummy __unused)
315 {
316 link_elf_late_ireloc();
317 }
318 SYSINIT(late_ifunc_resolve, SI_SUB_CPU, SI_ORDER_ANY, late_ifunc_resolve, NULL);
319
320
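/*
 * Set the CR0 control bits the kernel relies on: FPU monitoring (MP),
 * native FPU error reporting (NE), deferred FPU context loading (TS),
 * kernel-mode write protection (WP) and alignment-check support (AM).
 */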
321 void
322 cpu_setregs(void)
323 {
324 register_t cr0;
325
326 TSENTER();
327 cr0 = rcr0();
328 cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM;
329 TSENTER2("load_cr0");
330 load_cr0(cr0);
331 TSEXIT2("load_cr0");
332 TSEXIT();
333 }
334
335 /*
336 * Initialize amd64 and configure to run kernel
337 */
338
339 /*
340 * Initialize segments & interrupt table
341 */
342 static struct gate_descriptor idt0[NIDT];
343 struct gate_descriptor *idt = &idt0[0]; /* interrupt descriptor table */
344
345 static char dblfault_stack[DBLFAULT_STACK_SIZE] __aligned(16);
346 static char mce0_stack[MCE_STACK_SIZE] __aligned(16);
347 static char nmi0_stack[NMI_STACK_SIZE] __aligned(16);
348 static char dbg0_stack[DBG_STACK_SIZE] __aligned(16);
349 CTASSERT(sizeof(struct nmi_pcpu) == 16);
350
351 /*
352 * Software prototypes -- in more palatable form.
353 *
354 * Keep GUFS32, GUGS32, GUCODE32 and GUDATA at the same
355 * slots as corresponding segments for i386 kernel.
356 */
357 struct soft_segment_descriptor gdt_segs[] = {
358 [GNULL_SEL] = { /* 0 Null Descriptor */
359 .ssd_base = 0x0,
360 .ssd_limit = 0x0,
361 .ssd_type = 0,
362 .ssd_dpl = 0,
363 .ssd_p = 0,
364 .ssd_long = 0,
365 .ssd_def32 = 0,
366 .ssd_gran = 0 },
367 [GNULL2_SEL] = { /* 1 Null Descriptor */
368 .ssd_base = 0x0,
369 .ssd_limit = 0x0,
370 .ssd_type = 0,
371 .ssd_dpl = 0,
372 .ssd_p = 0,
373 .ssd_long = 0,
374 .ssd_def32 = 0,
375 .ssd_gran = 0 },
376 [GUFS32_SEL] = { /* 2 32 bit %gs Descriptor for user */
377 .ssd_base = 0x0,
378 .ssd_limit = 0xfffff,
379 .ssd_type = SDT_MEMRWA,
380 .ssd_dpl = SEL_UPL,
381 .ssd_p = 1,
382 .ssd_long = 0,
383 .ssd_def32 = 1,
384 .ssd_gran = 1 },
385 [GUGS32_SEL] = { /* 3 32 bit %fs Descriptor for user */
386 .ssd_base = 0x0,
387 .ssd_limit = 0xfffff,
388 .ssd_type = SDT_MEMRWA,
389 .ssd_dpl = SEL_UPL,
390 .ssd_p = 1,
391 .ssd_long = 0,
392 .ssd_def32 = 1,
393 .ssd_gran = 1 },
394 [GCODE_SEL] = { /* 4 Code Descriptor for kernel */
395 .ssd_base = 0x0,
396 .ssd_limit = 0xfffff,
397 .ssd_type = SDT_MEMERA,
398 .ssd_dpl = SEL_KPL,
399 .ssd_p = 1,
400 .ssd_long = 1,
401 .ssd_def32 = 0,
402 .ssd_gran = 1 },
403 [GDATA_SEL] = { /* 5 Data Descriptor for kernel */
404 .ssd_base = 0x0,
405 .ssd_limit = 0xfffff,
406 .ssd_type = SDT_MEMRWA,
407 .ssd_dpl = SEL_KPL,
408 .ssd_p = 1,
409 .ssd_long = 1,
410 .ssd_def32 = 0,
411 .ssd_gran = 1 },
412 [GUCODE32_SEL] = { /* 6 32 bit Code Descriptor for user */
413 .ssd_base = 0x0,
414 .ssd_limit = 0xfffff,
415 .ssd_type = SDT_MEMERA,
416 .ssd_dpl = SEL_UPL,
417 .ssd_p = 1,
418 .ssd_long = 0,
419 .ssd_def32 = 1,
420 .ssd_gran = 1 },
421 [GUDATA_SEL] = { /* 7 32/64 bit Data Descriptor for user */
422 .ssd_base = 0x0,
423 .ssd_limit = 0xfffff,
424 .ssd_type = SDT_MEMRWA,
425 .ssd_dpl = SEL_UPL,
426 .ssd_p = 1,
427 .ssd_long = 0,
428 .ssd_def32 = 1,
429 .ssd_gran = 1 },
430 [GUCODE_SEL] = { /* 8 64 bit Code Descriptor for user */
431 .ssd_base = 0x0,
432 .ssd_limit = 0xfffff,
433 .ssd_type = SDT_MEMERA,
434 .ssd_dpl = SEL_UPL,
435 .ssd_p = 1,
436 .ssd_long = 1,
437 .ssd_def32 = 0,
438 .ssd_gran = 1 },
439 [GPROC0_SEL] = { /* 9 Proc 0 TSS Descriptor */
440 .ssd_base = 0x0,
441 .ssd_limit = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE - 1,
442 .ssd_type = SDT_SYSTSS,
443 .ssd_dpl = SEL_KPL,
444 .ssd_p = 1,
445 .ssd_long = 0,
446 .ssd_def32 = 0,
447 .ssd_gran = 0 },
448 [GPROC0_SEL + 1] = { /* 10 Proc 0 TSS descriptor, double size */
449 .ssd_base = 0x0,
450 .ssd_limit = 0x0,
451 .ssd_type = 0,
452 .ssd_dpl = 0,
453 .ssd_p = 0,
454 .ssd_long = 0,
455 .ssd_def32 = 0,
456 .ssd_gran = 0 },
457 [GUSERLDT_SEL] = { /* 11 LDT Descriptor */
458 .ssd_base = 0x0,
459 .ssd_limit = 0x0,
460 .ssd_type = 0,
461 .ssd_dpl = 0,
462 .ssd_p = 0,
463 .ssd_long = 0,
464 .ssd_def32 = 0,
465 .ssd_gran = 0 },
466 [GUSERLDT_SEL + 1] = { /* 12 LDT Descriptor, double size */
467 .ssd_base = 0x0,
468 .ssd_limit = 0x0,
469 .ssd_type = 0,
470 .ssd_dpl = 0,
471 .ssd_p = 0,
472 .ssd_long = 0,
473 .ssd_def32 = 0,
474 .ssd_gran = 0 },
475 };
476 _Static_assert(nitems(gdt_segs) == NGDT, "Stale NGDT");
477
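/*
 * Install a single IDT gate: entry point 'func', gate type 'typ', descriptor
 * privilege level 'dpl' and interrupt stack table index 'ist'.
 */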
478 void
479 setidt(int idx, inthand_t *func, int typ, int dpl, int ist)
480 {
481 struct gate_descriptor *ip;
482
483 ip = idt + idx;
484 ip->gd_looffset = (uintptr_t)func;
485 ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
486 ip->gd_ist = ist;
487 ip->gd_xx = 0;
488 ip->gd_type = typ;
489 ip->gd_dpl = dpl;
490 ip->gd_p = 1;
491 ip->gd_hioffset = ((uintptr_t)func) >> 16;
492 }
493
494 extern inthand_t
495 IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
496 IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
497 IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
498 IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
499 IDTVEC(xmm), IDTVEC(dblfault),
500 IDTVEC(div_pti), IDTVEC(bpt_pti),
501 IDTVEC(ofl_pti), IDTVEC(bnd_pti), IDTVEC(ill_pti), IDTVEC(dna_pti),
502 IDTVEC(fpusegm_pti), IDTVEC(tss_pti), IDTVEC(missing_pti),
503 IDTVEC(stk_pti), IDTVEC(prot_pti), IDTVEC(page_pti),
504 IDTVEC(rsvd_pti), IDTVEC(fpu_pti), IDTVEC(align_pti),
505 IDTVEC(xmm_pti),
506 #ifdef KDTRACE_HOOKS
507 IDTVEC(dtrace_ret), IDTVEC(dtrace_ret_pti),
508 #endif
509 #ifdef XENHVM
510 IDTVEC(xen_intr_upcall), IDTVEC(xen_intr_upcall_pti),
511 #endif
512 IDTVEC(fast_syscall), IDTVEC(fast_syscall32),
513 IDTVEC(fast_syscall_pti);
514
515 #ifdef DDB
516 /*
517 * Display the index and function name of any IDT entries that don't use
518 * the default 'rsvd' entry point.
519 */
520 DB_SHOW_COMMAND_FLAGS(idt, db_show_idt, DB_CMD_MEMSAFE)
521 {
522 struct gate_descriptor *ip;
523 int idx;
524 uintptr_t func;
525
526 ip = idt;
527 for (idx = 0; idx < NIDT && !db_pager_quit; idx++) {
528 func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset);
529 if (func != (uintptr_t)&IDTVEC(rsvd)) {
530 db_printf("%3d\t", idx);
531 db_printsym(func, DB_STGY_PROC);
532 db_printf("\n");
533 }
534 ip++;
535 }
536 }
537
538 /* Show privileged registers. */
539 DB_SHOW_COMMAND_FLAGS(sysregs, db_show_sysregs, DB_CMD_MEMSAFE)
540 {
541 struct {
542 uint16_t limit;
543 uint64_t base;
544 } __packed idtr, gdtr;
545 uint16_t ldt, tr;
546
547 __asm __volatile("sidt %0" : "=m" (idtr));
548 db_printf("idtr\t0x%016lx/%04x\n",
549 (u_long)idtr.base, (u_int)idtr.limit);
550 __asm __volatile("sgdt %0" : "=m" (gdtr));
551 db_printf("gdtr\t0x%016lx/%04x\n",
552 (u_long)gdtr.base, (u_int)gdtr.limit);
553 __asm __volatile("sldt %0" : "=r" (ldt));
554 db_printf("ldtr\t0x%04x\n", ldt);
555 __asm __volatile("str %0" : "=r" (tr));
556 db_printf("tr\t0x%04x\n", tr);
557 db_printf("cr0\t0x%016lx\n", rcr0());
558 db_printf("cr2\t0x%016lx\n", rcr2());
559 db_printf("cr3\t0x%016lx\n", rcr3());
560 db_printf("cr4\t0x%016lx\n", rcr4());
561 if (rcr4() & CR4_XSAVE)
562 db_printf("xcr0\t0x%016lx\n", rxcr(0));
563 db_printf("EFER\t0x%016lx\n", rdmsr(MSR_EFER));
564 if (cpu_feature2 & (CPUID2_VMX | CPUID2_SMX))
565 db_printf("FEATURES_CTL\t%016lx\n",
566 rdmsr(MSR_IA32_FEATURE_CONTROL));
567 db_printf("DEBUG_CTL\t0x%016lx\n", rdmsr(MSR_DEBUGCTLMSR));
568 db_printf("PAT\t0x%016lx\n", rdmsr(MSR_PAT));
569 db_printf("GSBASE\t0x%016lx\n", rdmsr(MSR_GSBASE));
570 }
571
572 DB_SHOW_COMMAND_FLAGS(dbregs, db_show_dbregs, DB_CMD_MEMSAFE)
573 {
574
575 db_printf("dr0\t0x%016lx\n", rdr0());
576 db_printf("dr1\t0x%016lx\n", rdr1());
577 db_printf("dr2\t0x%016lx\n", rdr2());
578 db_printf("dr3\t0x%016lx\n", rdr3());
579 db_printf("dr6\t0x%016lx\n", rdr6());
580 db_printf("dr7\t0x%016lx\n", rdr7());
581 }
582 #endif
583
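/* Convert a hardware user segment descriptor into its software form. */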
584 void
585 sdtossd(struct user_segment_descriptor *sd, struct soft_segment_descriptor *ssd)
586 {
587
588 ssd->ssd_base = (sd->sd_hibase << 24) | sd->sd_lobase;
589 ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
590 ssd->ssd_type = sd->sd_type;
591 ssd->ssd_dpl = sd->sd_dpl;
592 ssd->ssd_p = sd->sd_p;
593 ssd->ssd_long = sd->sd_long;
594 ssd->ssd_def32 = sd->sd_def32;
595 ssd->ssd_gran = sd->sd_gran;
596 }
597
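/* Convert a software segment descriptor into a hardware user segment descriptor. */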
598 void
599 ssdtosd(struct soft_segment_descriptor *ssd, struct user_segment_descriptor *sd)
600 {
601
602 sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
603 sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff;
604 sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
605 sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
606 sd->sd_type = ssd->ssd_type;
607 sd->sd_dpl = ssd->ssd_dpl;
608 sd->sd_p = ssd->ssd_p;
609 sd->sd_long = ssd->ssd_long;
610 sd->sd_def32 = ssd->ssd_def32;
611 sd->sd_gran = ssd->ssd_gran;
612 }
613
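/* Convert a software segment descriptor into a hardware system (TSS/LDT) descriptor. */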
614 void
615 ssdtosyssd(struct soft_segment_descriptor *ssd, struct system_segment_descriptor *sd)
616 {
617
618 sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
619 sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful;
620 sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
621 sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
622 sd->sd_type = ssd->ssd_type;
623 sd->sd_dpl = ssd->ssd_dpl;
624 sd->sd_p = ssd->ssd_p;
625 sd->sd_gran = ssd->ssd_gran;
626 }
627
628 u_int basemem;
629
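/*
 * Add the range [base, base + length) to the physmap array, merging it with
 * an adjacent entry when possible.  Returns 0 if the array is full and 1
 * otherwise, including when a zero-length or overlapping range is ignored.
 */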
630 static int
631 add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap,
632 int *physmap_idxp)
633 {
634 int i, insert_idx, physmap_idx;
635
636 physmap_idx = *physmap_idxp;
637
638 if (length == 0)
639 return (1);
640
641 /*
642 * Find insertion point while checking for overlap. Start off by
643 * assuming the new entry will be added to the end.
644 *
645 * NB: physmap_idx points to the next free slot.
646 */
647 insert_idx = physmap_idx;
648 for (i = 0; i <= physmap_idx; i += 2) {
649 if (base < physmap[i + 1]) {
650 if (base + length <= physmap[i]) {
651 insert_idx = i;
652 break;
653 }
654 if (boothowto & RB_VERBOSE)
655 printf(
656 "Overlapping memory regions, ignoring second region\n");
657 return (1);
658 }
659 }
660
661 /* See if we can prepend to the next entry. */
662 if (insert_idx <= physmap_idx && base + length == physmap[insert_idx]) {
663 physmap[insert_idx] = base;
664 return (1);
665 }
666
667 /* See if we can append to the previous entry. */
668 if (insert_idx > 0 && base == physmap[insert_idx - 1]) {
669 physmap[insert_idx - 1] += length;
670 return (1);
671 }
672
673 physmap_idx += 2;
674 *physmap_idxp = physmap_idx;
675 if (physmap_idx == PHYS_AVAIL_ENTRIES) {
676 printf(
677 "Too many segments in the physical address map, giving up\n");
678 return (0);
679 }
680
681 /*
682 * Move the last 'N' entries down to make room for the new
683 * entry if needed.
684 */
685 for (i = (physmap_idx - 2); i > insert_idx; i -= 2) {
686 physmap[i] = physmap[i - 2];
687 physmap[i + 1] = physmap[i - 1];
688 }
689
690 /* Insert the new entry. */
691 physmap[insert_idx] = base;
692 physmap[insert_idx + 1] = base + length;
693 return (1);
694 }
695
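/*
 * Walk the BIOS SMAP (INT 15h/E820) entries supplied by the loader and add
 * each SMAP_TYPE_MEMORY range to the physmap array.
 */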
696 void
697 bios_add_smap_entries(struct bios_smap *smapbase, u_int32_t smapsize,
698 vm_paddr_t *physmap, int *physmap_idx)
699 {
700 struct bios_smap *smap, *smapend;
701
702 smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);
703
704 for (smap = smapbase; smap < smapend; smap++) {
705 if (boothowto & RB_VERBOSE)
706 printf("SMAP type=%02x base=%016lx len=%016lx\n",
707 smap->type, smap->base, smap->length);
708
709 if (smap->type != SMAP_TYPE_MEMORY)
710 continue;
711
712 if (!add_physmap_entry(smap->base, smap->length, physmap,
713 physmap_idx))
714 break;
715 }
716 }
717
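/*
 * Walk the UEFI memory map supplied by the loader, optionally printing each
 * descriptor when booting verbose, and add the usable descriptor types to
 * the physmap array.
 */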
718 static void
719 add_efi_map_entries(struct efi_map_header *efihdr, vm_paddr_t *physmap,
720 int *physmap_idx)
721 {
722 struct efi_md *map, *p;
723 const char *type;
724 size_t efisz;
725 int ndesc, i;
726
727 static const char *types[] = {
728 "Reserved",
729 "LoaderCode",
730 "LoaderData",
731 "BootServicesCode",
732 "BootServicesData",
733 "RuntimeServicesCode",
734 "RuntimeServicesData",
735 "ConventionalMemory",
736 "UnusableMemory",
737 "ACPIReclaimMemory",
738 "ACPIMemoryNVS",
739 "MemoryMappedIO",
740 "MemoryMappedIOPortSpace",
741 "PalCode",
742 "PersistentMemory"
743 };
744
745 /*
746 * Memory map data provided by UEFI via the GetMemoryMap
747 * Boot Services API.
748 */
749 efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf;
750 map = (struct efi_md *)((uint8_t *)efihdr + efisz);
751
752 if (efihdr->descriptor_size == 0)
753 return;
754 ndesc = efihdr->memory_size / efihdr->descriptor_size;
755
756 if (boothowto & RB_VERBOSE)
757 printf("%23s %12s %12s %8s %4s\n",
758 "Type", "Physical", "Virtual", "#Pages", "Attr");
759
760 for (i = 0, p = map; i < ndesc; i++,
761 p = efi_next_descriptor(p, efihdr->descriptor_size)) {
762 if (boothowto & RB_VERBOSE) {
763 if (p->md_type < nitems(types))
764 type = types[p->md_type];
765 else
766 type = "<INVALID>";
767 printf("%23s %012lx %012lx %08lx ", type, p->md_phys,
768 p->md_virt, p->md_pages);
769 if (p->md_attr & EFI_MD_ATTR_UC)
770 printf("UC ");
771 if (p->md_attr & EFI_MD_ATTR_WC)
772 printf("WC ");
773 if (p->md_attr & EFI_MD_ATTR_WT)
774 printf("WT ");
775 if (p->md_attr & EFI_MD_ATTR_WB)
776 printf("WB ");
777 if (p->md_attr & EFI_MD_ATTR_UCE)
778 printf("UCE ");
779 if (p->md_attr & EFI_MD_ATTR_WP)
780 printf("WP ");
781 if (p->md_attr & EFI_MD_ATTR_RP)
782 printf("RP ");
783 if (p->md_attr & EFI_MD_ATTR_XP)
784 printf("XP ");
785 if (p->md_attr & EFI_MD_ATTR_NV)
786 printf("NV ");
787 if (p->md_attr & EFI_MD_ATTR_MORE_RELIABLE)
788 printf("MORE_RELIABLE ");
789 if (p->md_attr & EFI_MD_ATTR_RO)
790 printf("RO ");
791 if (p->md_attr & EFI_MD_ATTR_RT)
792 printf("RUNTIME");
793 printf("\n");
794 }
795
796 switch (p->md_type) {
797 case EFI_MD_TYPE_CODE:
798 case EFI_MD_TYPE_DATA:
799 case EFI_MD_TYPE_BS_CODE:
800 case EFI_MD_TYPE_BS_DATA:
801 case EFI_MD_TYPE_FREE:
802 /*
803 * We're allowed to use any entry with these types.
804 */
805 break;
806 default:
807 continue;
808 }
809
810 if (!add_physmap_entry(p->md_phys, p->md_pages * EFI_PAGE_SIZE,
811 physmap, physmap_idx))
812 break;
813 }
814 }
815
816 static void
817 native_parse_memmap(vm_paddr_t *physmap, int *physmap_idx)
818 {
819 struct bios_smap *smap;
820 struct efi_map_header *efihdr;
821 u_int32_t size;
822
823 /*
824 * Memory map from INT 15:E820.
825 *
826 * subr_module.c says:
827 * "Consumer may safely assume that size value precedes data."
828 * i.e. a u_int32_t immediately precedes smap.
829 */
830
831 efihdr = (struct efi_map_header *)preload_search_info(preload_kmdp,
832 MODINFO_METADATA | MODINFOMD_EFI_MAP);
833 smap = (struct bios_smap *)preload_search_info(preload_kmdp,
834 MODINFO_METADATA | MODINFOMD_SMAP);
835 if (efihdr == NULL && smap == NULL)
836 panic("No BIOS smap or EFI map info from loader!");
837
838 if (efihdr != NULL) {
839 add_efi_map_entries(efihdr, physmap, physmap_idx);
840 strlcpy(bootmethod, "UEFI", sizeof(bootmethod));
841 } else {
842 size = *((u_int32_t *)smap - 1);
843 bios_add_smap_entries(smap, size, physmap, physmap_idx);
844 strlcpy(bootmethod, "BIOS", sizeof(bootmethod));
845 }
846 }
847
848 #define PAGES_PER_GB (1024 * 1024 * 1024 / PAGE_SIZE)
849
850 /*
851 * Populate the (physmap) array with base/bound pairs describing the
852 * available physical memory in the system, then test this memory and
853 * build the phys_avail array describing the actually-available memory.
854 *
855 * Total memory size may be set by the kernel environment variable
856 * hw.physmem or the compile-time define MAXMEM.
857 *
858 * XXX first should be vm_paddr_t.
859 */
860 static void
861 getmemsize(u_int64_t first)
862 {
863 int i, physmap_idx, pa_indx, da_indx;
864 vm_paddr_t pa, physmap[PHYS_AVAIL_ENTRIES];
865 u_long physmem_start, physmem_tunable, memtest;
866 pt_entry_t *pte;
867 quad_t dcons_addr, dcons_size;
868 int page_counter;
869
870 TSENTER();
871 /*
872 * Tell the physical memory allocator about pages used to store
873 * the kernel and preloaded data. See kmem_bootstrap_free().
874 */
875 vm_phys_early_add_seg((vm_paddr_t)kernphys, trunc_page(first));
876
877 bzero(physmap, sizeof(physmap));
878 physmap_idx = 0;
879
880 init_ops.parse_memmap(physmap, &physmap_idx);
881 physmap_idx -= 2;
882
883 /*
884 * Find the 'base memory' segment for SMP
885 */
886 basemem = 0;
887 for (i = 0; i <= physmap_idx; i += 2) {
888 if (physmap[i] <= 0xA0000) {
889 basemem = physmap[i + 1] / 1024;
890 break;
891 }
892 }
893 if (basemem == 0 || basemem > 640) {
894 if (bootverbose)
895 printf(
896 "Memory map doesn't contain a basemem segment, faking it");
897 basemem = 640;
898 }
899
900 /*
901 * Maxmem isn't the "maximum memory", it's one larger than the
902 * highest page of the physical address space. It should be
903 * called something like "Maxphyspage". We may adjust this
904 * based on ``hw.physmem'' and the results of the memory test.
905 */
906 Maxmem = atop(physmap[physmap_idx + 1]);
907
908 #ifdef MAXMEM
909 Maxmem = MAXMEM / 4;
910 #endif
911
912 if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
913 Maxmem = atop(physmem_tunable);
914
915 /*
916 * The boot memory test is disabled by default, as it takes a
917 * significant amount of time on large-memory systems, and is
918 * unfriendly to virtual machines as it unnecessarily touches all
919 * pages.
920 *
921 * A general name is used as the code may be extended to support
922 * additional tests beyond the current "page present" test.
923 */
924 memtest = 0;
925 TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest);
926
927 /*
928 * Don't allow MAXMEM or hw.physmem to extend the amount of memory
929 * in the system.
930 */
931 if (Maxmem > atop(physmap[physmap_idx + 1]))
932 Maxmem = atop(physmap[physmap_idx + 1]);
933
934 if (atop(physmap[physmap_idx + 1]) != Maxmem &&
935 (boothowto & RB_VERBOSE))
936 printf("Physical memory use set to %ldK\n", Maxmem * 4);
937
938 /* call pmap initialization to make new kernel address space */
939 pmap_bootstrap(&first);
940
941 /*
942 * Size up each available chunk of physical memory.
943 *
944 * XXX Some BIOSes corrupt low 64KB between suspend and resume.
945 * By default, mask off the first 16 pages unless we appear to be
946 * running in a VM.
947 */
948 physmem_start = (vm_guest > VM_GUEST_NO ? 1 : 16) << PAGE_SHIFT;
949 TUNABLE_ULONG_FETCH("hw.physmem.start", &physmem_start);
950 if (physmap[0] < physmem_start) {
951 if (physmem_start < PAGE_SIZE)
952 physmap[0] = PAGE_SIZE;
953 else if (physmem_start >= physmap[1])
954 physmap[0] = round_page(physmap[1] - PAGE_SIZE);
955 else
956 physmap[0] = round_page(physmem_start);
957 }
958 pa_indx = 0;
959 da_indx = 1;
960 phys_avail[pa_indx++] = physmap[0];
961 phys_avail[pa_indx] = physmap[0];
962 dump_avail[da_indx] = physmap[0];
963 pte = CMAP1;
964
965 /*
966 * Get dcons buffer address
967 */
968 if (getenv_quad("dcons.addr", &dcons_addr) == 0 ||
969 getenv_quad("dcons.size", &dcons_size) == 0)
970 dcons_addr = 0;
971
972 /*
973 * physmap is in bytes, so when converting to page boundaries,
974 * round up the start address and round down the end address.
975 */
976 page_counter = 0;
977 if (memtest != 0)
978 printf("Testing system memory");
979 for (i = 0; i <= physmap_idx; i += 2) {
980 vm_paddr_t end;
981
982 end = ptoa((vm_paddr_t)Maxmem);
983 if (physmap[i + 1] < end)
984 end = trunc_page(physmap[i + 1]);
985 for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
986 int *ptr = (int *)CADDR1;
987 int tmp;
988 bool full, page_bad;
989
990 full = false;
991 /*
992 * block out kernel memory as not available.
993 */
994 if (pa >= (vm_paddr_t)kernphys && pa < first)
995 goto do_dump_avail;
996
997 /*
998 * block out dcons buffer
999 */
1000 if (dcons_addr > 0
1001 && pa >= trunc_page(dcons_addr)
1002 && pa < dcons_addr + dcons_size)
1003 goto do_dump_avail;
1004
1005 page_bad = false;
1006 if (memtest == 0)
1007 goto skip_memtest;
1008
1009 /*
1010 * Print a "." every GB to show we're making
1011 * progress.
1012 */
1013 page_counter++;
1014 if ((page_counter % PAGES_PER_GB) == 0)
1015 printf(".");
1016
1017 /*
1018 * map page into kernel: valid, read/write, non-cacheable
1019 */
1020 *pte = pa | PG_V | PG_RW | PG_NC_PWT | PG_NC_PCD;
1021 invltlb();
1022
1023 tmp = *(int *)ptr;
1024 /*
1025 * Test for alternating 1's and 0's
1026 */
1027 *(volatile int *)ptr = 0xaaaaaaaa;
1028 if (*(volatile int *)ptr != 0xaaaaaaaa)
1029 page_bad = true;
1030 /*
1031 * Test for alternating 0's and 1's
1032 */
1033 *(volatile int *)ptr = 0x55555555;
1034 if (*(volatile int *)ptr != 0x55555555)
1035 page_bad = true;
1036 /*
1037 * Test for all 1's
1038 */
1039 *(volatile int *)ptr = 0xffffffff;
1040 if (*(volatile int *)ptr != 0xffffffff)
1041 page_bad = true;
1042 /*
1043 * Test for all 0's
1044 */
1045 *(volatile int *)ptr = 0x0;
1046 if (*(volatile int *)ptr != 0x0)
1047 page_bad = true;
1048 /*
1049 * Restore original value.
1050 */
1051 *(int *)ptr = tmp;
1052
1053 skip_memtest:
1054 /*
1055 * Adjust array of valid/good pages.
1056 */
1057 if (page_bad == true)
1058 continue;
1059 /*
1060 * If this good page is a continuation of the
1061 * previous set of good pages, then just increase
1062 * the end pointer. Otherwise start a new chunk.
1063 * Note that "end" points one page beyond the last valid
1064 * page, making the range >= start and < end.
1065 * If we're also doing a speculative memory
1066 * test and we're at or past the end, bump up Maxmem
1067 * so that we keep going. The first bad page
1068 * will terminate the loop.
1069 */
1070 if (phys_avail[pa_indx] == pa) {
1071 phys_avail[pa_indx] += PAGE_SIZE;
1072 } else {
1073 pa_indx++;
1074 if (pa_indx == PHYS_AVAIL_ENTRIES) {
1075 printf(
1076 "Too many holes in the physical address space, giving up\n");
1077 pa_indx--;
1078 full = true;
1079 goto do_dump_avail;
1080 }
1081 phys_avail[pa_indx++] = pa; /* start */
1082 phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */
1083 }
1084 physmem++;
1085 do_dump_avail:
1086 if (dump_avail[da_indx] == pa) {
1087 dump_avail[da_indx] += PAGE_SIZE;
1088 } else {
1089 da_indx++;
1090 if (da_indx == PHYS_AVAIL_ENTRIES) {
1091 da_indx--;
1092 goto do_next;
1093 }
1094 dump_avail[da_indx++] = pa; /* start */
1095 dump_avail[da_indx] = pa + PAGE_SIZE; /* end */
1096 }
1097 do_next:
1098 if (full)
1099 break;
1100 }
1101 }
1102 *pte = 0;
1103 invltlb();
1104 if (memtest != 0)
1105 printf("\n");
1106
1107 /*
1108 * XXX
1109 * The last chunk must contain at least one page plus the message
1110 * buffer to avoid complicating other code (message buffer address
1111 * calculation, etc.).
1112 */
1113 while (phys_avail[pa_indx - 1] + PAGE_SIZE +
1114 round_page(msgbufsize) >= phys_avail[pa_indx]) {
1115 physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
1116 phys_avail[pa_indx--] = 0;
1117 phys_avail[pa_indx--] = 0;
1118 }
1119
1120 Maxmem = atop(phys_avail[pa_indx]);
1121
1122 /* Trim off space for the message buffer. */
1123 phys_avail[pa_indx] -= round_page(msgbufsize);
1124
1125 /* Map the message buffer. */
1126 msgbufp = (struct msgbuf *)PHYS_TO_DMAP(phys_avail[pa_indx]);
1127 TSEXIT();
1128 }
1129
1130 static void
1131 native_parse_preload_data(u_int64_t modulep)
1132 {
1133 char *envp;
1134 #ifdef DDB
1135 vm_offset_t ksym_start;
1136 vm_offset_t ksym_end;
1137 #endif
1138
1139 preload_metadata = (caddr_t)(uintptr_t)(modulep + KERNBASE);
1140 preload_bootstrap_relocate(KERNBASE);
1141 preload_initkmdp(true);
1142 boothowto = MD_FETCH(preload_kmdp, MODINFOMD_HOWTO, int);
1143 envp = MD_FETCH(preload_kmdp, MODINFOMD_ENVP, char *);
1144 if (envp != NULL)
1145 envp += KERNBASE;
1146 init_static_kenv(envp, 0);
1147 #ifdef DDB
1148 ksym_start = MD_FETCH(preload_kmdp, MODINFOMD_SSYM, uintptr_t);
1149 ksym_end = MD_FETCH(preload_kmdp, MODINFOMD_ESYM, uintptr_t);
1150 db_fetch_ksymtab(ksym_start, ksym_end, 0);
1151 #endif
1152 efi_systbl_phys = MD_FETCH(preload_kmdp, MODINFOMD_FW_HANDLE,
1153 vm_paddr_t);
1154 }
1155
1156 static void
1157 native_clock_source_init(void)
1158 {
1159 i8254_init();
1160 }
1161
1162 static void
1163 amd64_kdb_init(void)
1164 {
1165 kdb_init();
1166 #ifdef KDB
1167 if (boothowto & RB_KDB)
1168 kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger");
1169 #endif
1170 }
1171
1172 /* Set up the fast syscall stuff */
1173 void
1174 amd64_conf_fast_syscall(void)
1175 {
1176 uint64_t msr;
1177
1178 msr = rdmsr(MSR_EFER) | EFER_SCE;
1179 wrmsr(MSR_EFER, msr);
1180 wrmsr(MSR_LSTAR, pti ? (u_int64_t)IDTVEC(fast_syscall_pti) :
1181 (u_int64_t)IDTVEC(fast_syscall));
1182 wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
1183 msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
1184 ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
1185 wrmsr(MSR_STAR, msr);
1186 wrmsr(MSR_SF_MASK, PSL_NT | PSL_T | PSL_I | PSL_C | PSL_D | PSL_AC);
1187 }
1188
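/*
 * First stage of BSP per-CPU initialization: wire up the pcpu pointer,
 * thread0 and the GDT-derived TSS, LDT and 32-bit %fs/%gs descriptor
 * pointers.
 */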
1189 void
1190 amd64_bsp_pcpu_init1(struct pcpu *pc)
1191 {
1192 struct user_segment_descriptor *gdt;
1193
1194 PCPU_SET(prvspace, pc);
1195 gdt = *PCPU_PTR(gdt);
1196 PCPU_SET(curthread, &thread0);
1197 PCPU_SET(tssp, PCPU_PTR(common_tss));
1198 PCPU_SET(tss, (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
1199 PCPU_SET(ldt, (struct system_segment_descriptor *)&gdt[GUSERLDT_SEL]);
1200 PCPU_SET(fs32p, &gdt[GUFS32_SEL]);
1201 PCPU_SET(gs32p, &gdt[GUGS32_SEL]);
1202 PCPU_SET(ucr3_load_mask, PMAP_UCR3_NOMASK);
1203 PCPU_SET(smp_tlb_gen, 1);
1204 }
1205
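/*
 * Second stage of BSP per-CPU initialization: record the kernel stack
 * pointer (rsp0), the PTI trampoline stack pointer and thread0's PCB.
 */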
1206 void
1207 amd64_bsp_pcpu_init2(uint64_t rsp0)
1208 {
1209
1210 PCPU_SET(rsp0, rsp0);
1211 PCPU_SET(pti_rsp0, ((vm_offset_t)PCPU_PTR(pti_stack) +
1212 PC_PTI_STACK_SZ * sizeof(uint64_t)) & ~0xful);
1213 PCPU_SET(curpcb, thread0.td_pcb);
1214 }
1215
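/*
 * Point the TSS interrupt stack table entries at the dedicated double fault,
 * NMI, machine check and debug stacks.  The pcpu pointer is stored at the
 * top of each stack.
 */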
1216 void
1217 amd64_bsp_ist_init(struct pcpu *pc)
1218 {
1219 struct nmi_pcpu *np;
1220 struct amd64tss *tssp;
1221
1222 tssp = &pc->pc_common_tss;
1223
1224 /* doublefault stack space, runs on ist1 */
1225 np = ((struct nmi_pcpu *)&dblfault_stack[sizeof(dblfault_stack)]) - 1;
1226 np->np_pcpu = (register_t)pc;
1227 tssp->tss_ist1 = (long)np;
1228
1229 /*
1230 * NMI stack, runs on ist2. The pcpu pointer is stored just
1231 * above the start of the ist2 stack.
1232 */
1233 np = ((struct nmi_pcpu *)&nmi0_stack[sizeof(nmi0_stack)]) - 1;
1234 np->np_pcpu = (register_t)pc;
1235 tssp->tss_ist2 = (long)np;
1236
1237 /*
1238 * MC# stack, runs on ist3. The pcpu pointer is stored just
1239 * above the start of the ist3 stack.
1240 */
1241 np = ((struct nmi_pcpu *)&mce0_stack[sizeof(mce0_stack)]) - 1;
1242 np->np_pcpu = (register_t)pc;
1243 tssp->tss_ist3 = (long)np;
1244
1245 /*
1246 * DB# stack, runs on ist4.
1247 */
1248 np = ((struct nmi_pcpu *)&dbg0_stack[sizeof(dbg0_stack)]) - 1;
1249 np->np_pcpu = (register_t)pc;
1250 tssp->tss_ist4 = (long)np;
1251 }
1252
1253 /*
1254 * Calculate the kernel load address by inspecting page table created by loader.
1255 * The assumptions:
1256 * - kernel is mapped at KERNBASE, backed by contiguous phys memory
1257 * aligned at 2M, below 4G (the latter is important for AP startup)
1258 * - there is a 2M hole at KERNBASE (KERNSTART = KERNBASE + 2M)
1259 * - kernel is mapped with 2M superpages
1260 * - all participating memory, i.e. kernel, modules, metadata,
1261 * page table is accessible by pre-created 1:1 mapping
1262 * (right now loader creates 1:1 mapping for lower 4G, and all
1263 * memory is from there)
1264 * - there is a usable memory block right after the end of the
1265 * mapped kernel and all modules/metadata, pointed to by
1266 * physfree, for early allocations
1267 */
1268 vm_paddr_t __nosanitizeaddress __nosanitizememory
1269 amd64_loadaddr(void)
1270 {
1271 pml4_entry_t *pml4e;
1272 pdp_entry_t *pdpe;
1273 pd_entry_t *pde;
1274 uint64_t cr3;
1275
1276 cr3 = rcr3();
1277 pml4e = (pml4_entry_t *)cr3 + pmap_pml4e_index(KERNSTART);
1278 pdpe = (pdp_entry_t *)(*pml4e & PG_FRAME) + pmap_pdpe_index(KERNSTART);
1279 pde = (pd_entry_t *)(*pdpe & PG_FRAME) + pmap_pde_index(KERNSTART);
1280 return (*pde & PG_FRAME);
1281 }
1282
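/*
 * amd64 kernel bootstrap, called from locore with the loader metadata
 * pointer (modulep) and the first free physical address (physfree).
 * Returns the initial kernel stack pointer for thread0.
 */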
1283 u_int64_t
1284 hammer_time(u_int64_t modulep, u_int64_t physfree)
1285 {
1286 int gsel_tss, x;
1287 struct pcpu *pc;
1288 uint64_t rsp0;
1289 char *env;
1290 struct user_segment_descriptor *gdt;
1291 struct region_descriptor r_gdt;
1292 size_t kstack0_sz;
1293
1294 TSRAW(&thread0, TS_ENTER, __func__, NULL);
1295
1296 kernphys = amd64_loadaddr();
1297
1298 physfree += kernphys;
1299
1300 /* Initializes preload_kmdp */
1301 init_ops.parse_preload_data(modulep);
1302
1303 efi_boot = preload_search_info(preload_kmdp, MODINFO_METADATA |
1304 MODINFOMD_EFI_MAP) != NULL;
1305
1306 if (!efi_boot) {
1307 /* Tell the bios to warmboot next time */
1308 atomic_store_short((u_short *)0x472, 0x1234);
1309 }
1310
1311 physfree += ucode_load_bsp(physfree - kernphys + KERNSTART);
1312 physfree = roundup2(physfree, PAGE_SIZE);
1313
1314 identify_cpu1();
1315 identify_hypervisor();
1316 identify_hypervisor_smbios();
1317 identify_cpu_fixup_bsp();
1318 identify_cpu2();
1319 initializecpucache();
1320
1321 /*
1322 * Check for pti, pcid, and invpcid before ifuncs are
1323 * resolved, to correctly select the implementation for
1324 * pmap_activate_sw_mode().
1325 */
1326 pti = pti_get_default();
1327 TUNABLE_INT_FETCH("vm.pmap.pti", &pti);
1328 TUNABLE_INT_FETCH("vm.pmap.pcid_enabled", &pmap_pcid_enabled);
1329 if ((cpu_feature2 & CPUID2_PCID) != 0 && pmap_pcid_enabled) {
1330 invpcid_works = (cpu_stdext_feature &
1331 CPUID_STDEXT_INVPCID) != 0;
1332 } else {
1333 pmap_pcid_enabled = 0;
1334 }
1335
1336 /*
1337 * Now we can do small core initialization, after the PCID
1338 * CPU features and user knobs are evaluated.
1339 */
1340 TUNABLE_INT_FETCH("vm.pmap.pcid_invlpg_workaround",
1341 &pmap_pcid_invlpg_workaround_uena);
1342 cpu_init_small_core();
1343
1344 if ((cpu_feature2 & CPUID2_XSAVE) != 0) {
1345 use_xsave = 1;
1346 TUNABLE_INT_FETCH("hw.use_xsave", &use_xsave);
1347 }
1348
1349 link_elf_ireloc();
1350
1351 /*
1352 * This may be done better later if it gets more high level
1353 * components in it. If so just link td->td_proc here.
1354 */
1355 proc_linkup0(&proc0, &thread0);
1356
1357 /* Init basic tunables, hz etc */
1358 init_param1();
1359
1360 thread0.td_kstack = physfree - kernphys + KERNSTART;
1361 thread0.td_kstack_pages = kstack_pages;
1362 kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE;
1363 bzero((void *)thread0.td_kstack, kstack0_sz);
1364 physfree += kstack0_sz;
1365
1366 /*
1367 * Initialize enough of thread0 for delayed invalidation to
1368 * work very early. Rely on thread0.td_base_pri
1369 * zero-initialization, it is reset to PVM at proc0_init().
1370 */
1371 pmap_thread_init_invl_gen(&thread0);
1372
1373 pc = &temp_bsp_pcpu;
1374 pcpu_init(pc, 0, sizeof(struct pcpu));
1375 gdt = &temp_bsp_pcpu.pc_gdt[0];
1376
1377 /*
1378 * make gdt memory segments
1379 */
1380 for (x = 0; x < NGDT; x++) {
1381 if (x != GPROC0_SEL && x != (GPROC0_SEL + 1) &&
1382 x != GUSERLDT_SEL && x != (GUSERLDT_SEL + 1))
1383 ssdtosd(&gdt_segs[x], &gdt[x]);
1384 }
1385 gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&pc->pc_common_tss;
1386 ssdtosyssd(&gdt_segs[GPROC0_SEL],
1387 (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
1388
1389 r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
1390 r_gdt.rd_base = (long)gdt;
1391 lgdt(&r_gdt);
1392
1393 wrmsr(MSR_FSBASE, 0); /* User value */
1394 wrmsr(MSR_GSBASE, (u_int64_t)pc);
1395 wrmsr(MSR_KGSBASE, 0); /* User value while in the kernel */
1396
1397 dpcpu_init((void *)(physfree - kernphys + KERNSTART), 0);
1398 physfree += DPCPU_SIZE;
1399 amd64_bsp_pcpu_init1(pc);
1400 /* Non-late cninit() and printf() can be moved up to here. */
1401
1402 /*
1403 * Initialize mutexes.
1404 *
1405 * icu_lock: in order to allow an interrupt to occur in a critical
1406 * section, to set pcpu->ipending (etc...) properly, we
1407 * must be able to get the icu lock, so it can't be
1408 * under witness.
1409 */
1410 mutex_init();
1411 mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS);
1412 mtx_init(&dt_lock, "descriptor tables", NULL, MTX_DEF);
1413
1414 /* exceptions */
1415 for (x = 0; x < NIDT; x++)
1416 setidt(x, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_SYSIGT,
1417 SEL_KPL, 0);
1418 setidt(IDT_DE, pti ? &IDTVEC(div_pti) : &IDTVEC(div), SDT_SYSIGT,
1419 SEL_KPL, 0);
1420 setidt(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 4);
1421 setidt(IDT_NMI, &IDTVEC(nmi), SDT_SYSIGT, SEL_KPL, 2);
1422 setidt(IDT_BP, pti ? &IDTVEC(bpt_pti) : &IDTVEC(bpt), SDT_SYSIGT,
1423 SEL_UPL, 0);
1424 setidt(IDT_OF, pti ? &IDTVEC(ofl_pti) : &IDTVEC(ofl), SDT_SYSIGT,
1425 SEL_UPL, 0);
1426 setidt(IDT_BR, pti ? &IDTVEC(bnd_pti) : &IDTVEC(bnd), SDT_SYSIGT,
1427 SEL_KPL, 0);
1428 setidt(IDT_UD, pti ? &IDTVEC(ill_pti) : &IDTVEC(ill), SDT_SYSIGT,
1429 SEL_KPL, 0);
1430 setidt(IDT_NM, pti ? &IDTVEC(dna_pti) : &IDTVEC(dna), SDT_SYSIGT,
1431 SEL_KPL, 0);
1432 setidt(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
1433 setidt(IDT_FPUGP, pti ? &IDTVEC(fpusegm_pti) : &IDTVEC(fpusegm),
1434 SDT_SYSIGT, SEL_KPL, 0);
1435 setidt(IDT_TS, pti ? &IDTVEC(tss_pti) : &IDTVEC(tss), SDT_SYSIGT,
1436 SEL_KPL, 0);
1437 setidt(IDT_NP, pti ? &IDTVEC(missing_pti) : &IDTVEC(missing),
1438 SDT_SYSIGT, SEL_KPL, 0);
1439 setidt(IDT_SS, pti ? &IDTVEC(stk_pti) : &IDTVEC(stk), SDT_SYSIGT,
1440 SEL_KPL, 0);
1441 setidt(IDT_GP, pti ? &IDTVEC(prot_pti) : &IDTVEC(prot), SDT_SYSIGT,
1442 SEL_KPL, 0);
1443 setidt(IDT_PF, pti ? &IDTVEC(page_pti) : &IDTVEC(page), SDT_SYSIGT,
1444 SEL_KPL, 0);
1445 setidt(IDT_MF, pti ? &IDTVEC(fpu_pti) : &IDTVEC(fpu), SDT_SYSIGT,
1446 SEL_KPL, 0);
1447 setidt(IDT_AC, pti ? &IDTVEC(align_pti) : &IDTVEC(align), SDT_SYSIGT,
1448 SEL_KPL, 0);
1449 setidt(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 3);
1450 setidt(IDT_XF, pti ? &IDTVEC(xmm_pti) : &IDTVEC(xmm), SDT_SYSIGT,
1451 SEL_KPL, 0);
1452 #ifdef KDTRACE_HOOKS
1453 setidt(IDT_DTRACE_RET, pti ? &IDTVEC(dtrace_ret_pti) :
1454 &IDTVEC(dtrace_ret), SDT_SYSIGT, SEL_UPL, 0);
1455 #endif
1456 #ifdef XENHVM
1457 setidt(IDT_EVTCHN, pti ? &IDTVEC(xen_intr_upcall_pti) :
1458 &IDTVEC(xen_intr_upcall), SDT_SYSIGT, SEL_KPL, 0);
1459 #endif
1460 r_idt.rd_limit = sizeof(idt0) - 1;
1461 r_idt.rd_base = (long) idt;
1462 lidt(&r_idt);
1463
1464 TUNABLE_INT_FETCH("hw.ibrs_disable", &hw_ibrs_disable);
1465 TUNABLE_INT_FETCH("machdep.mitigations.ibrs.disable", &hw_ibrs_disable);
1466
1467 TUNABLE_INT_FETCH("hw.spec_store_bypass_disable", &hw_ssb_disable);
1468 TUNABLE_INT_FETCH("machdep.mitigations.ssb.disable", &hw_ssb_disable);
1469
1470 TUNABLE_INT_FETCH("machdep.syscall_ret_flush_l1d",
1471 &syscall_ret_l1d_flush_mode);
1472
1473 TUNABLE_INT_FETCH("hw.mds_disable", &hw_mds_disable);
1474 TUNABLE_INT_FETCH("machdep.mitigations.mds.disable", &hw_mds_disable);
1475
1476 TUNABLE_INT_FETCH("machdep.mitigations.taa.enable", &x86_taa_enable);
1477
1478 TUNABLE_INT_FETCH("machdep.mitigations.rngds.enable",
1479 &x86_rngds_mitg_enable);
1480
1481 TUNABLE_INT_FETCH("machdep.mitigations.zenbleed.enable",
1482 &zenbleed_enable);
1483 zenbleed_sanitize_enable();
1484
1485 finishidentcpu(); /* Final stage of CPU initialization */
1486
1487 invlpgb_works = (amd_extended_feature_extensions &
1488 AMDFEID_INVLPGB) != 0;
1489 TUNABLE_INT_FETCH("vm.pmap.invlpgb_works", &invlpgb_works);
1490 if (invlpgb_works)
1491 invlpgb_maxcnt = cpu_procinfo3 & AMDID_INVLPGB_MAXCNT;
1492
1493 /*
1494 * Initialize the clock before the console so that console
1495 * initialization can use DELAY().
1496 */
1497 clock_init();
1498
1499 initializecpu(); /* Initialize CPU registers */
1500
1501 amd64_bsp_ist_init(pc);
1502
1503 /* Set the IO permission bitmap (empty due to tss seg limit) */
1504 pc->pc_common_tss.tss_iobase = sizeof(struct amd64tss) +
1505 IOPERM_BITMAP_SIZE;
1506
1507 gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
1508 ltr(gsel_tss);
1509
1510 amd64_conf_fast_syscall();
1511
1512 /*
1513 * We initialize the PCB pointer early so that exception
1514 * handlers will work. Also set up td_critnest to short-cut
1515 * the page fault handler.
1516 */
1517 cpu_max_ext_state_size = sizeof(struct savefpu);
1518 set_top_of_stack_td(&thread0);
1519 thread0.td_pcb = get_pcb_td(&thread0);
1520 thread0.td_critnest = 1;
1521
1522 /*
1523 * The console and kdb should be initialized even earlier than here,
1524 * but some console drivers don't work until after getmemsize().
1525 * Default to late console initialization to support these drivers.
1526 * This loses mainly printf()s in getmemsize() and early debugging.
1527 */
1528 TUNABLE_INT_FETCH("debug.late_console", &late_console);
1529 if (!late_console) {
1530 cninit();
1531 amd64_kdb_init();
1532 }
1533
1534 getmemsize(physfree);
1535 init_param2(physmem);
1536
1537 /* now running on new page tables, configured, and u/iom is accessible */
1538
1539 #ifdef DEV_PCI
1540 /* This call might adjust phys_avail[]. */
1541 pci_early_quirks();
1542 #endif
1543
1544 if (late_console)
1545 cninit();
1546
1547 /*
1548 * Dump the boot metadata. We have to wait for cninit() since console
1549 * output is required. If it's grossly incorrect the kernel will never
1550 * make it this far.
1551 */
1552 if (getenv_is_true("debug.dump_modinfo_at_boot"))
1553 preload_dump();
1554
1555 #ifdef DEV_ISA
1556 #ifdef DEV_ATPIC
1557 elcr_probe();
1558 atpic_startup();
1559 #else
1560 /* Reset and mask the atpics and leave them shut down. */
1561 atpic_reset();
1562
1563 /*
1564 * Point the ICU spurious interrupt vectors at the APIC spurious
1565 * interrupt handler.
1566 */
1567 setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
1568 setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
1569 #endif
1570 #else
1571 #error "have you forgotten the isa device?"
1572 #endif
1573
1574 if (late_console)
1575 amd64_kdb_init();
1576
1577 msgbufinit(msgbufp, msgbufsize);
1578 fpuinit();
1579
1580 /* make an initial tss so cpu can get interrupt stack on syscall! */
1581 rsp0 = thread0.td_md.md_stack_base;
1582 /* Ensure the stack is aligned to 16 bytes */
1583 rsp0 &= ~0xFul;
1584 PCPU_PTR(common_tss)->tss_rsp0 = rsp0;
1585 amd64_bsp_pcpu_init2(rsp0);
1586
1587 /* transfer to user mode */
1588
1589 _ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
1590 _udatasel = GSEL(GUDATA_SEL, SEL_UPL);
1591 _ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL);
1592 _ufssel = GSEL(GUFS32_SEL, SEL_UPL);
1593 _ugssel = GSEL(GUGS32_SEL, SEL_UPL);
1594
1595 load_ds(_udatasel);
1596 load_es(_udatasel);
1597 load_fs(_ufssel);
1598
1599 /* setup proc 0's pcb */
1600 thread0.td_pcb->pcb_flags = 0;
1601
1602 env = kern_getenv("kernelname");
1603 if (env != NULL)
1604 strlcpy(kernelname, env, sizeof(kernelname));
1605
1606 kcsan_cpu_init(0);
1607
1608 #ifdef FDT
1609 x86_init_fdt();
1610 #endif
1611 thread0.td_critnest = 0;
1612
1613 kasan_init();
1614 kmsan_init();
1615
1616 TSEXIT();
1617
1618 /* Location of kernel stack for locore */
1619 return (thread0.td_md.md_stack_base);
1620 }
1621
1622 void
1623 cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
1624 {
1625
1626 pcpu->pc_acpi_id = 0xffffffff;
1627 }
1628
1629 static int
1630 smap_sysctl_handler(SYSCTL_HANDLER_ARGS)
1631 {
1632 struct bios_smap *smapbase;
1633 struct bios_smap_xattr smap;
1634 uint32_t *smapattr;
1635 int count, error, i;
1636
1637 /* Retrieve the system memory map from the loader. */
1638 smapbase = (struct bios_smap *)preload_search_info(preload_kmdp,
1639 MODINFO_METADATA | MODINFOMD_SMAP);
1640 if (smapbase == NULL)
1641 return (0);
1642 smapattr = (uint32_t *)preload_search_info(preload_kmdp,
1643 MODINFO_METADATA | MODINFOMD_SMAP_XATTR);
1644 count = *((uint32_t *)smapbase - 1) / sizeof(*smapbase);
1645 error = 0;
1646 for (i = 0; i < count; i++) {
1647 smap.base = smapbase[i].base;
1648 smap.length = smapbase[i].length;
1649 smap.type = smapbase[i].type;
1650 if (smapattr != NULL)
1651 smap.xattr = smapattr[i];
1652 else
1653 smap.xattr = 0;
1654 error = SYSCTL_OUT(req, &smap, sizeof(smap));
1655 }
1656 return (error);
1657 }
1658 SYSCTL_PROC(_machdep, OID_AUTO, smap,
1659 CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
1660 smap_sysctl_handler, "S,bios_smap_xattr",
1661 "Raw BIOS SMAP data");
1662
1663 static int
1664 efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS)
1665 {
1666 struct efi_map_header *efihdr;
1667 uint32_t efisize;
1668
1669 efihdr = (struct efi_map_header *)preload_search_info(preload_kmdp,
1670 MODINFO_METADATA | MODINFOMD_EFI_MAP);
1671 if (efihdr == NULL)
1672 return (0);
1673 efisize = *((uint32_t *)efihdr - 1);
1674 return (SYSCTL_OUT(req, efihdr, efisize));
1675 }
1676 SYSCTL_PROC(_machdep, OID_AUTO, efi_map,
1677 CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
1678 efi_map_sysctl_handler, "S,efi_map_header",
1679 "Raw EFI Memory Map");
1680
1681 static int
1682 efi_arch_sysctl_handler(SYSCTL_HANDLER_ARGS)
1683 {
1684 char *arch;
1685
1686 arch = (char *)preload_search_info(preload_kmdp,
1687 MODINFO_METADATA | MODINFOMD_EFI_ARCH);
1688 if (arch == NULL)
1689 return (0);
1690
1691 return (SYSCTL_OUT_STR(req, arch));
1692 }
1693 SYSCTL_PROC(_machdep, OID_AUTO, efi_arch,
1694 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
1695 efi_arch_sysctl_handler, "A", "EFI Firmware Architecture");
1696
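/*
 * MD spinlock support: the outermost spinlock_enter() disables interrupts,
 * saves the flags and enters a critical section; nested calls only bump the
 * per-thread count.
 */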
1697 void
1698 spinlock_enter(void)
1699 {
1700 struct thread *td;
1701 register_t flags;
1702
1703 td = curthread;
1704 if (td->td_md.md_spinlock_count == 0) {
1705 flags = intr_disable();
1706 td->td_md.md_spinlock_count = 1;
1707 td->td_md.md_saved_flags = flags;
1708 critical_enter();
1709 } else
1710 td->td_md.md_spinlock_count++;
1711 }
1712
1713 void
1714 spinlock_exit(void)
1715 {
1716 struct thread *td;
1717 register_t flags;
1718
1719 td = curthread;
1720 flags = td->td_md.md_saved_flags;
1721 td->td_md.md_spinlock_count--;
1722 if (td->td_md.md_spinlock_count == 0) {
1723 critical_exit();
1724 intr_restore(flags);
1725 }
1726 }
1727
1728 /*
1729 * Construct a PCB from a trapframe. This is called from kdb_trap() where
1730 * we want to start a backtrace from the function that caused us to enter
1731 * the debugger. We have the context in the trapframe, but base the trace
1732 * on the PCB. The PCB doesn't have to be perfect, as long as it contains
1733 * enough for a backtrace.
1734 */
1735 void
1736 makectx(struct trapframe *tf, struct pcb *pcb)
1737 {
1738
1739 pcb->pcb_r12 = tf->tf_r12;
1740 pcb->pcb_r13 = tf->tf_r13;
1741 pcb->pcb_r14 = tf->tf_r14;
1742 pcb->pcb_r15 = tf->tf_r15;
1743 pcb->pcb_rbp = tf->tf_rbp;
1744 pcb->pcb_rbx = tf->tf_rbx;
1745 pcb->pcb_rip = tf->tf_rip;
1746 pcb->pcb_rsp = tf->tf_rsp;
1747 }
1748
1749 /*
1750 * pcb_flags is only modified by the current thread, or by other threads
1751 * when the current thread is stopped.  However, the current thread may
1752 * change it from interrupt context in cpu_switch(), or in the trap handler.
1753 * When we read-modify-write pcb_flags from C sources, the compiler may
1754 * generate code that is not atomic with respect to the interrupt handler.
1755 * If a trap or interrupt happens and any flag is modified from the handler,
1756 * it can be clobbered with the cached value later.  Therefore, we implement
1757 * setting and clearing flags with single-instruction functions that do not
1758 * race with modification of the flags from the trap or interrupt context,
1759 * because traps and interrupts are delivered only on instruction boundaries.
1760 */
1761 void
1762 set_pcb_flags_raw(struct pcb *pcb, const u_int flags)
1763 {
1764
1765 __asm __volatile("orl %1,%0"
1766 : "=m" (pcb->pcb_flags) : "ir" (flags), "m" (pcb->pcb_flags)
1767 : "cc", "memory");
1768
1769 }
1770
1771 /*
1772 * The support for RDFSBASE, WRFSBASE and similar instructions for the
1773 * %gs base requires that the kernel save MSR_FSBASE and MSR_{K,}GSBASE
1774 * into the pcb if user space modified the bases.  We must save them on
1775 * a context switch or if the return to usermode happens through doreti.
1776 *
1777 * Tracking of both events is performed by the pcb flag PCB_FULL_IRET,
1778 * which has the consequence that the base MSRs must be saved each time
1779 * the PCB_FULL_IRET flag is set.  We disable interrupts to sync with
1780 * context switches.
1781 */
1782 static void
1783 set_pcb_flags_fsgsbase(struct pcb *pcb, const u_int flags)
1784 {
1785 register_t r;
1786
1787 if (curpcb == pcb &&
1788 (flags & PCB_FULL_IRET) != 0 &&
1789 (pcb->pcb_flags & PCB_FULL_IRET) == 0) {
1790 r = intr_disable();
1791 if ((pcb->pcb_flags & PCB_FULL_IRET) == 0) {
1792 if (rfs() == _ufssel)
1793 pcb->pcb_fsbase = rdfsbase();
1794 if (rgs() == _ugssel)
1795 pcb->pcb_gsbase = rdmsr(MSR_KGSBASE);
1796 }
1797 set_pcb_flags_raw(pcb, flags);
1798 intr_restore(r);
1799 } else {
1800 set_pcb_flags_raw(pcb, flags);
1801 }
1802 }
1803
1804 DEFINE_IFUNC(, void, set_pcb_flags, (struct pcb *, const u_int))
1805 {
1806
1807 return ((cpu_stdext_feature & CPUID_STDEXT_FSGSBASE) != 0 ?
1808 set_pcb_flags_fsgsbase : set_pcb_flags_raw);
1809 }
1810
1811 void
1812 clear_pcb_flags(struct pcb *pcb, const u_int flags)
1813 {
1814
1815 __asm __volatile("andl %1,%0"
1816 : "=m" (pcb->pcb_flags) : "ir" (~flags), "m" (pcb->pcb_flags)
1817 : "cc", "memory");
1818 }
1819
1820 #ifdef KDB
1821
1822 /*
1823 * Provide inb() and outb() as functions. They are normally only available as
1824 * inline functions and thus cannot be called from the debugger.
1825 */
1826
1827 /* silence compiler warnings */
1828 u_char inb_(u_short);
1829 void outb_(u_short, u_char);
1830
1831 u_char
1832 inb_(u_short port)
1833 {
1834 return inb(port);
1835 }
1836
1837 void
1838 outb_(u_short port, u_char data)
1839 {
1840 outb(port, data);
1841 }
1842
1843 #endif /* KDB */
1844
1845 #undef memset
1846 #undef memmove
1847 #undef memcpy
1848
1849 void *memset_std(void *buf, int c, size_t len);
1850 void *memset_erms(void *buf, int c, size_t len);
1851 void *memmove_std(void * _Nonnull dst, const void * _Nonnull src,
1852 size_t len);
1853 void *memmove_erms(void * _Nonnull dst, const void * _Nonnull src,
1854 size_t len);
1855 void *memcpy_std(void * _Nonnull dst, const void * _Nonnull src,
1856 size_t len);
1857 void *memcpy_erms(void * _Nonnull dst, const void * _Nonnull src,
1858 size_t len);
1859
1860 #ifdef KCSAN
1861 /*
1862 * These fail to build as ifuncs when used with KCSAN.
1863 */
1864 void *
1865 memset(void *buf, int c, size_t len)
1866 {
1867
1868 return (memset_std(buf, c, len));
1869 }
1870
1871 void *
1872 memmove(void * _Nonnull dst, const void * _Nonnull src, size_t len)
1873 {
1874
1875 return (memmove_std(dst, src, len));
1876 }
1877
1878 void *
1879 memcpy(void * _Nonnull dst, const void * _Nonnull src, size_t len)
1880 {
1881
1882 return (memcpy_std(dst, src, len));
1883 }
1884 #else
1885 DEFINE_IFUNC(, void *, memset, (void *, int, size_t))
1886 {
1887
1888 return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
1889 memset_erms : memset_std);
1890 }
1891
1892 DEFINE_IFUNC(, void *, memmove, (void * _Nonnull, const void * _Nonnull,
1893 size_t))
1894 {
1895
1896 return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
1897 memmove_erms : memmove_std);
1898 }
1899
1900 DEFINE_IFUNC(, void *, memcpy, (void * _Nonnull, const void * _Nonnull, size_t))
1901 {
1902
1903 return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
1904 memcpy_erms : memcpy_std);
1905 }
1906 #endif
1907
1908 void pagezero_std(void *addr);
1909 void pagezero_erms(void *addr);
1910 DEFINE_IFUNC(, void, pagezero, (void *))
1911 {
1912
1913 return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
1914 pagezero_erms : pagezero_std);
1915 }
1916