1 /*-
2 * SPDX-License-Identifier: BSD-4-Clause
3 *
4 * Copyright (c) 2003 Peter Wemm.
5 * Copyright (c) 1992 Terrence R. Lambert.
6 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
7 * All rights reserved.
8 *
9 * This code is derived from software contributed to Berkeley by
10 * William Jolitz.
11 *
12 * Redistribution and use in source and binary forms, with or without
13 * modification, are permitted provided that the following conditions
14 * are met:
15 * 1. Redistributions of source code must retain the above copyright
16 * notice, this list of conditions and the following disclaimer.
17 * 2. Redistributions in binary form must reproduce the above copyright
18 * notice, this list of conditions and the following disclaimer in the
19 * documentation and/or other materials provided with the distribution.
20 * 3. All advertising materials mentioning features or use of this software
21 * must display the following acknowledgement:
22 * This product includes software developed by the University of
23 * California, Berkeley and its contributors.
24 * 4. Neither the name of the University nor the names of its contributors
25 * may be used to endorse or promote products derived from this software
26 * without specific prior written permission.
27 *
28 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
29 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
32 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
33 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
34 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
35 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
37 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
38 * SUCH DAMAGE.
39 */
40
41 #include <sys/cdefs.h>
42 #include "opt_atpic.h"
43 #include "opt_cpu.h"
44 #include "opt_ddb.h"
45 #include "opt_inet.h"
46 #include "opt_isa.h"
47 #include "opt_kstack_pages.h"
48 #include "opt_maxmem.h"
49 #include "opt_pci.h"
50 #include "opt_platform.h"
51 #include "opt_sched.h"
52
53 #include <sys/param.h>
54 #include <sys/proc.h>
55 #include <sys/systm.h>
56 #include <sys/asan.h>
57 #include <sys/bio.h>
58 #include <sys/buf.h>
59 #include <sys/bus.h>
60 #include <sys/callout.h>
61 #include <sys/cons.h>
62 #include <sys/cpu.h>
63 #include <sys/csan.h>
64 #include <sys/efi.h>
65 #include <sys/eventhandler.h>
66 #include <sys/exec.h>
67 #include <sys/imgact.h>
68 #include <sys/kdb.h>
69 #include <sys/kernel.h>
70 #include <sys/ktr.h>
71 #include <sys/linker.h>
72 #include <sys/lock.h>
73 #include <sys/malloc.h>
74 #include <sys/memrange.h>
75 #include <sys/msan.h>
76 #include <sys/msgbuf.h>
77 #include <sys/mutex.h>
78 #include <sys/pcpu.h>
79 #include <sys/ptrace.h>
80 #include <sys/reboot.h>
81 #include <sys/reg.h>
82 #include <sys/rwlock.h>
83 #include <sys/sched.h>
84 #include <sys/signalvar.h>
85 #ifdef SMP
86 #include <sys/smp.h>
87 #endif
88 #include <sys/syscallsubr.h>
89 #include <sys/sysctl.h>
90 #include <sys/sysent.h>
91 #include <sys/sysproto.h>
92 #include <sys/ucontext.h>
93 #include <sys/vmmeter.h>
94
95 #include <vm/vm.h>
96 #include <vm/vm_param.h>
97 #include <vm/vm_extern.h>
98 #include <vm/vm_kern.h>
99 #include <vm/vm_page.h>
100 #include <vm/vm_map.h>
101 #include <vm/vm_object.h>
102 #include <vm/vm_pager.h>
103 #include <vm/vm_phys.h>
104 #include <vm/vm_dumpset.h>
105
106 #ifdef DDB
107 #ifndef KDB
108 #error KDB must be enabled in order for DDB to work!
109 #endif
110 #include <ddb/ddb.h>
111 #include <ddb/db_sym.h>
112 #endif
113
114 #include <net/netisr.h>
115
116 #include <dev/smbios/smbios.h>
117
118 #include <machine/clock.h>
119 #include <machine/cpu.h>
120 #include <machine/cputypes.h>
121 #include <machine/frame.h>
122 #include <machine/intr_machdep.h>
123 #include <x86/mca.h>
124 #include <machine/md_var.h>
125 #include <machine/metadata.h>
126 #include <machine/pc/bios.h>
127 #include <machine/pcb.h>
128 #include <machine/proc.h>
129 #include <machine/sigframe.h>
130 #include <machine/specialreg.h>
131 #include <machine/trap.h>
132 #include <machine/tss.h>
133 #include <x86/ucode.h>
134 #include <x86/ifunc.h>
135 #ifdef SMP
136 #include <machine/smp.h>
137 #endif
138 #ifdef FDT
139 #include <x86/fdt.h>
140 #endif
141
142 #ifdef DEV_ATPIC
143 #include <x86/isa/icu.h>
144 #else
145 #include <x86/apicvar.h>
146 #endif
147
148 #include <isa/isareg.h>
149 #include <isa/rtc.h>
150 #include <x86/init.h>
151
152 /* Sanity check for __curthread() */
153 CTASSERT(offsetof(struct pcpu, pc_curthread) == 0);
154
155 /*
156 * The PTI trampoline stack needs enough space for a hardware trapframe and a
157 * couple of scratch registers, as well as the trapframe left behind after an
158 * iret fault.
159 */
160 CTASSERT(PC_PTI_STACK_SZ * sizeof(register_t) >= 2 * sizeof(struct pti_frame) -
161 offsetof(struct pti_frame, pti_rip));
162
163 extern u_int64_t hammer_time(u_int64_t, u_int64_t);
164
165 static void cpu_startup(void *);
166 SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL);
167
168 /* Probe 8254 PIT and TSC. */
169 static void native_clock_source_init(void);
170
171 /* Preload data parse function */
172 static void native_parse_preload_data(u_int64_t);
173
174 /* Native function to fetch and parse the e820 map */
175 static void native_parse_memmap(vm_paddr_t *, int *);
176
177 /* Default init_ops implementation. */
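/*
 * Other boot environments may install their own hook table before
 * hammer_time() runs; for example, the Xen PVH entry path substitutes
 * PVH-specific preload-data and memory-map parsers here.
 */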
178 struct init_ops init_ops = {
179 .parse_preload_data = native_parse_preload_data,
180 .early_clock_source_init = native_clock_source_init,
181 .early_delay = i8254_delay,
182 .parse_memmap = native_parse_memmap,
183 };
184
185 /*
186 * Physical address of the EFI System Table. Stashed from the metadata hints
187 * passed into the kernel and used by the EFI code to call runtime services.
188 */
189 vm_paddr_t efi_systbl_phys;
190
191 /*
192 * Bitmap of extra EFI memory region types that should be preserved and mapped
193 * during runtime services calls.
194 */
195 uint32_t efi_map_regs;
196
197 /* Intel ICH registers */
198 #define ICH_PMBASE 0x400
199 #define ICH_SMI_EN (ICH_PMBASE + 0x30)
200
201 int _udatasel, _ucodesel, _ucode32sel, _ufssel, _ugssel;
202
203 int cold = 1;
204
205 long Maxmem = 0;
206 long realmem = 0;
207 int late_console = 1;
208
209 struct kva_md_info kmi;
210
211 struct region_descriptor r_idt;
212
213 struct pcpu *__pcpu;
214 struct pcpu temp_bsp_pcpu;
215
216 struct mtx icu_lock;
217
218 struct mem_range_softc mem_range_softc;
219
220 struct mtx dt_lock; /* lock for GDT and LDT */
221
222 void (*vmm_suspend_p)(void);
223 void (*vmm_resume_p)(void);
224
225 bool efi_boot;
226
227 static void
228 cpu_startup(void *dummy)
229 {
230 uintmax_t memsize;
231 char *sysenv;
232
233 /*
234 * On MacBooks, we need to prevent the legacy USB circuitry from
235 * generating an SMI#, which can cause several problems,
236 * namely: incorrect CPU frequency detection and failure to
237 * start the APs.
238 * We do this by disabling a bit in the SMI_EN (SMI Control and
239 * Enable register) of the Intel ICH LPC Interface Bridge.
240 */
241 sysenv = kern_getenv("smbios.system.product");
242 if (sysenv != NULL) {
243 if (strncmp(sysenv, "MacBook1,1", 10) == 0 ||
244 strncmp(sysenv, "MacBook3,1", 10) == 0 ||
245 strncmp(sysenv, "MacBook4,1", 10) == 0 ||
246 strncmp(sysenv, "MacBookPro1,1", 13) == 0 ||
247 strncmp(sysenv, "MacBookPro1,2", 13) == 0 ||
248 strncmp(sysenv, "MacBookPro3,1", 13) == 0 ||
249 strncmp(sysenv, "MacBookPro4,1", 13) == 0 ||
250 strncmp(sysenv, "Macmini1,1", 10) == 0) {
251 if (bootverbose)
252 printf("Disabling LEGACY_USB_EN bit on "
253 "Intel ICH.\n");
254 outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8);
255 }
256 freeenv(sysenv);
257 }
258
259 /*
260 * Good {morning,afternoon,evening,night}.
261 */
262 startrtclock();
263 printcpuinfo();
264
265 /*
266 * Display physical memory if SMBIOS reports a reasonable amount.
267 */
268 memsize = 0;
269 sysenv = kern_getenv("smbios.memory.enabled");
270 if (sysenv != NULL) {
271 memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10;
272 freeenv(sysenv);
273 }
274 if (memsize < ptoa((uintmax_t)vm_free_count()))
275 memsize = ptoa((uintmax_t)Maxmem);
276 printf("real memory = %ju (%ju MB)\n", memsize, memsize >> 20);
277 realmem = atop(memsize);
278
279 /*
280 * Display any holes after the first chunk of extended memory.
281 */
282 if (bootverbose) {
283 int indx;
284
285 printf("Physical memory chunk(s):\n");
286 for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
287 vm_paddr_t size;
288
289 size = phys_avail[indx + 1] - phys_avail[indx];
290 printf(
291 "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n",
292 (uintmax_t)phys_avail[indx],
293 (uintmax_t)phys_avail[indx + 1] - 1,
294 (uintmax_t)size, (uintmax_t)size / PAGE_SIZE);
295 }
296 }
297
298 vm_ksubmap_init(&kmi);
299
300 printf("avail memory = %ju (%ju MB)\n",
301 ptoa((uintmax_t)vm_free_count()),
302 ptoa((uintmax_t)vm_free_count()) / 1048576);
303 #ifdef DEV_PCI
304 if (bootverbose && intel_graphics_stolen_base != 0)
305 printf("intel stolen mem: base %#jx size %ju MB\n",
306 (uintmax_t)intel_graphics_stolen_base,
307 (uintmax_t)intel_graphics_stolen_size / 1024 / 1024);
308 #endif
309
310 /*
311 * Set up buffers, so they can be used to read disk labels.
312 */
313 bufinit();
314 vm_pager_bufferinit();
315
316 cpu_setregs();
317 }
318
319 static void
320 late_ifunc_resolve(void *dummy __unused)
321 {
322 link_elf_late_ireloc();
323 }
324 SYSINIT(late_ifunc_resolve, SI_SUB_CPU, SI_ORDER_ANY, late_ifunc_resolve, NULL);
325
326
327 void
328 cpu_setregs(void)
329 {
330 register_t cr0;
331
332 TSENTER();
333 cr0 = rcr0();
334 cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM;
335 TSENTER2("load_cr0");
336 load_cr0(cr0);
337 TSEXIT2("load_cr0");
338 TSEXIT();
339 }
340
341 /*
342 * Initialize amd64 and configure to run kernel
343 */
344
345 /*
346 * Initialize segments & interrupt table
347 */
348 static struct gate_descriptor idt0[NIDT];
349 struct gate_descriptor *idt = &idt0[0]; /* interrupt descriptor table */
350
351 static char dblfault_stack[DBLFAULT_STACK_SIZE] __aligned(16);
352 static char mce0_stack[MCE_STACK_SIZE] __aligned(16);
353 static char nmi0_stack[NMI_STACK_SIZE] __aligned(16);
354 static char dbg0_stack[DBG_STACK_SIZE] __aligned(16);
355 CTASSERT(sizeof(struct nmi_pcpu) == 16);
356
357 /*
358 * Software prototypes -- in more palatable form.
359 *
360 * Keep GUFS32, GUGS32, GUCODE32 and GUDATA at the same
361 * slots as the corresponding segments in the i386 kernel.
362 */
363 struct soft_segment_descriptor gdt_segs[] = {
364 [GNULL_SEL] = { /* 0 Null Descriptor */
365 .ssd_base = 0x0,
366 .ssd_limit = 0x0,
367 .ssd_type = 0,
368 .ssd_dpl = 0,
369 .ssd_p = 0,
370 .ssd_long = 0,
371 .ssd_def32 = 0,
372 .ssd_gran = 0 },
373 [GNULL2_SEL] = { /* 1 Null Descriptor */
374 .ssd_base = 0x0,
375 .ssd_limit = 0x0,
376 .ssd_type = 0,
377 .ssd_dpl = 0,
378 .ssd_p = 0,
379 .ssd_long = 0,
380 .ssd_def32 = 0,
381 .ssd_gran = 0 },
382 [GUFS32_SEL] = { /* 2 32 bit %gs Descriptor for user */
383 .ssd_base = 0x0,
384 .ssd_limit = 0xfffff,
385 .ssd_type = SDT_MEMRWA,
386 .ssd_dpl = SEL_UPL,
387 .ssd_p = 1,
388 .ssd_long = 0,
389 .ssd_def32 = 1,
390 .ssd_gran = 1 },
391 [GUGS32_SEL] = { /* 3 32 bit %fs Descriptor for user */
392 .ssd_base = 0x0,
393 .ssd_limit = 0xfffff,
394 .ssd_type = SDT_MEMRWA,
395 .ssd_dpl = SEL_UPL,
396 .ssd_p = 1,
397 .ssd_long = 0,
398 .ssd_def32 = 1,
399 .ssd_gran = 1 },
400 [GCODE_SEL] = { /* 4 Code Descriptor for kernel */
401 .ssd_base = 0x0,
402 .ssd_limit = 0xfffff,
403 .ssd_type = SDT_MEMERA,
404 .ssd_dpl = SEL_KPL,
405 .ssd_p = 1,
406 .ssd_long = 1,
407 .ssd_def32 = 0,
408 .ssd_gran = 1 },
409 [GDATA_SEL] = { /* 5 Data Descriptor for kernel */
410 .ssd_base = 0x0,
411 .ssd_limit = 0xfffff,
412 .ssd_type = SDT_MEMRWA,
413 .ssd_dpl = SEL_KPL,
414 .ssd_p = 1,
415 .ssd_long = 1,
416 .ssd_def32 = 0,
417 .ssd_gran = 1 },
418 [GUCODE32_SEL] = { /* 6 32 bit Code Descriptor for user */
419 .ssd_base = 0x0,
420 .ssd_limit = 0xfffff,
421 .ssd_type = SDT_MEMERA,
422 .ssd_dpl = SEL_UPL,
423 .ssd_p = 1,
424 .ssd_long = 0,
425 .ssd_def32 = 1,
426 .ssd_gran = 1 },
427 [GUDATA_SEL] = { /* 7 32/64 bit Data Descriptor for user */
428 .ssd_base = 0x0,
429 .ssd_limit = 0xfffff,
430 .ssd_type = SDT_MEMRWA,
431 .ssd_dpl = SEL_UPL,
432 .ssd_p = 1,
433 .ssd_long = 0,
434 .ssd_def32 = 1,
435 .ssd_gran = 1 },
436 [GUCODE_SEL] = { /* 8 64 bit Code Descriptor for user */
437 .ssd_base = 0x0,
438 .ssd_limit = 0xfffff,
439 .ssd_type = SDT_MEMERA,
440 .ssd_dpl = SEL_UPL,
441 .ssd_p = 1,
442 .ssd_long = 1,
443 .ssd_def32 = 0,
444 .ssd_gran = 1 },
445 [GPROC0_SEL] = { /* 9 Proc 0 TSS Descriptor */
446 .ssd_base = 0x0,
447 .ssd_limit = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE - 1,
448 .ssd_type = SDT_SYSTSS,
449 .ssd_dpl = SEL_KPL,
450 .ssd_p = 1,
451 .ssd_long = 0,
452 .ssd_def32 = 0,
453 .ssd_gran = 0 },
454 [GPROC0_SEL + 1] = { /* 10 Proc 0 TSS descriptor, double size */
455 .ssd_base = 0x0,
456 .ssd_limit = 0x0,
457 .ssd_type = 0,
458 .ssd_dpl = 0,
459 .ssd_p = 0,
460 .ssd_long = 0,
461 .ssd_def32 = 0,
462 .ssd_gran = 0 },
463 [GUSERLDT_SEL] = { /* 11 LDT Descriptor */
464 .ssd_base = 0x0,
465 .ssd_limit = 0x0,
466 .ssd_type = 0,
467 .ssd_dpl = 0,
468 .ssd_p = 0,
469 .ssd_long = 0,
470 .ssd_def32 = 0,
471 .ssd_gran = 0 },
472 [GUSERLDT_SEL + 1] = { /* 12 LDT Descriptor, double size */
473 .ssd_base = 0x0,
474 .ssd_limit = 0x0,
475 .ssd_type = 0,
476 .ssd_dpl = 0,
477 .ssd_p = 0,
478 .ssd_long = 0,
479 .ssd_def32 = 0,
480 .ssd_gran = 0 },
481 };
482 _Static_assert(nitems(gdt_segs) == NGDT, "Stale NGDT");
483
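/*
 * Fill in one 16-byte interrupt gate in the IDT: the handler offset is
 * split across gd_looffset/gd_hioffset, gd_selector picks the kernel code
 * segment, gd_ist selects an optional alternate stack from the TSS, and
 * gd_type/gd_dpl/gd_p give the gate type, privilege level and present bit.
 */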
484 void
485 setidt(int idx, inthand_t *func, int typ, int dpl, int ist)
486 {
487 struct gate_descriptor *ip;
488
489 ip = idt + idx;
490 ip->gd_looffset = (uintptr_t)func;
491 ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
492 ip->gd_ist = ist;
493 ip->gd_xx = 0;
494 ip->gd_type = typ;
495 ip->gd_dpl = dpl;
496 ip->gd_p = 1;
497 ip->gd_hioffset = ((uintptr_t)func)>>16 ;
498 }
499
500 extern inthand_t
501 IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
502 IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
503 IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
504 IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
505 IDTVEC(xmm), IDTVEC(dblfault),
506 IDTVEC(div_pti), IDTVEC(bpt_pti),
507 IDTVEC(ofl_pti), IDTVEC(bnd_pti), IDTVEC(ill_pti), IDTVEC(dna_pti),
508 IDTVEC(fpusegm_pti), IDTVEC(tss_pti), IDTVEC(missing_pti),
509 IDTVEC(stk_pti), IDTVEC(prot_pti), IDTVEC(page_pti),
510 IDTVEC(rsvd_pti), IDTVEC(fpu_pti), IDTVEC(align_pti),
511 IDTVEC(xmm_pti),
512 #ifdef KDTRACE_HOOKS
513 IDTVEC(dtrace_ret), IDTVEC(dtrace_ret_pti),
514 #endif
515 #ifdef XENHVM
516 IDTVEC(xen_intr_upcall), IDTVEC(xen_intr_upcall_pti),
517 #endif
518 IDTVEC(fast_syscall), IDTVEC(fast_syscall32),
519 IDTVEC(fast_syscall_pti);
520
521 #ifdef DDB
522 /*
523 * Display the index and function name of any IDT entries that don't use
524 * the default 'rsvd' entry point.
525 */
526 DB_SHOW_COMMAND_FLAGS(idt, db_show_idt, DB_CMD_MEMSAFE)
527 {
528 struct gate_descriptor *ip;
529 int idx;
530 uintptr_t func;
531
532 ip = idt;
533 for (idx = 0; idx < NIDT && !db_pager_quit; idx++) {
534 func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset);
535 if (func != (uintptr_t)&IDTVEC(rsvd)) {
536 db_printf("%3d\t", idx);
537 db_printsym(func, DB_STGY_PROC);
538 db_printf("\n");
539 }
540 ip++;
541 }
542 }
543
544 /* Show privileged registers. */
545 DB_SHOW_COMMAND_FLAGS(sysregs, db_show_sysregs, DB_CMD_MEMSAFE)
546 {
547 struct {
548 uint16_t limit;
549 uint64_t base;
550 } __packed idtr, gdtr;
551 uint16_t ldt, tr;
552
553 __asm __volatile("sidt %0" : "=m" (idtr));
554 db_printf("idtr\t0x%016lx/%04x\n",
555 (u_long)idtr.base, (u_int)idtr.limit);
556 __asm __volatile("sgdt %0" : "=m" (gdtr));
557 db_printf("gdtr\t0x%016lx/%04x\n",
558 (u_long)gdtr.base, (u_int)gdtr.limit);
559 __asm __volatile("sldt %0" : "=r" (ldt));
560 db_printf("ldtr\t0x%04x\n", ldt);
561 __asm __volatile("str %0" : "=r" (tr));
562 db_printf("tr\t0x%04x\n", tr);
563 db_printf("cr0\t0x%016lx\n", rcr0());
564 db_printf("cr2\t0x%016lx\n", rcr2());
565 db_printf("cr3\t0x%016lx\n", rcr3());
566 db_printf("cr4\t0x%016lx\n", rcr4());
567 if (rcr4() & CR4_XSAVE)
568 db_printf("xcr0\t0x%016lx\n", rxcr(0));
569 db_printf("EFER\t0x%016lx\n", rdmsr(MSR_EFER));
570 if (cpu_feature2 & (CPUID2_VMX | CPUID2_SMX))
571 db_printf("FEATURES_CTL\t%016lx\n",
572 rdmsr(MSR_IA32_FEATURE_CONTROL));
573 db_printf("DEBUG_CTL\t0x%016lx\n", rdmsr(MSR_DEBUGCTLMSR));
574 db_printf("PAT\t0x%016lx\n", rdmsr(MSR_PAT));
575 db_printf("GSBASE\t0x%016lx\n", rdmsr(MSR_GSBASE));
576 }
577
578 DB_SHOW_COMMAND_FLAGS(dbregs, db_show_dbregs, DB_CMD_MEMSAFE)
579 {
580
581 db_printf("dr0\t0x%016lx\n", rdr0());
582 db_printf("dr1\t0x%016lx\n", rdr1());
583 db_printf("dr2\t0x%016lx\n", rdr2());
584 db_printf("dr3\t0x%016lx\n", rdr3());
585 db_printf("dr6\t0x%016lx\n", rdr6());
586 db_printf("dr7\t0x%016lx\n", rdr7());
587 }
588 #endif
589
590 void
591 sdtossd(struct user_segment_descriptor *sd, struct soft_segment_descriptor *ssd)
592 {
593
594 ssd->ssd_base = (sd->sd_hibase << 24) | sd->sd_lobase;
595 ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
596 ssd->ssd_type = sd->sd_type;
597 ssd->ssd_dpl = sd->sd_dpl;
598 ssd->ssd_p = sd->sd_p;
599 ssd->ssd_long = sd->sd_long;
600 ssd->ssd_def32 = sd->sd_def32;
601 ssd->ssd_gran = sd->sd_gran;
602 }
603
604 void
605 ssdtosd(struct soft_segment_descriptor *ssd, struct user_segment_descriptor *sd)
606 {
607
608 sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
609 sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff;
610 sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
611 sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
612 sd->sd_type = ssd->ssd_type;
613 sd->sd_dpl = ssd->ssd_dpl;
614 sd->sd_p = ssd->ssd_p;
615 sd->sd_long = ssd->ssd_long;
616 sd->sd_def32 = ssd->ssd_def32;
617 sd->sd_gran = ssd->ssd_gran;
618 }
619
620 void
621 ssdtosyssd(struct soft_segment_descriptor *ssd, struct system_segment_descriptor *sd)
622 {
623
624 sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
625 sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful;
626 sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
627 sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
628 sd->sd_type = ssd->ssd_type;
629 sd->sd_dpl = ssd->ssd_dpl;
630 sd->sd_p = ssd->ssd_p;
631 sd->sd_gran = ssd->ssd_gran;
632 }
633
634 u_int basemem;
635
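/*
 * physmap[] holds [base, end) address pairs in ascending order, two
 * entries per region; add_physmap_entry() keeps it sorted and merges a
 * new region with an existing neighbour when they share a boundary.
 */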
636 static int
637 add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap,
638 int *physmap_idxp)
639 {
640 int i, insert_idx, physmap_idx;
641
642 physmap_idx = *physmap_idxp;
643
644 if (length == 0)
645 return (1);
646
647 /*
648 * Find insertion point while checking for overlap. Start off by
649 * assuming the new entry will be added to the end.
650 *
651 * NB: physmap_idx points to the next free slot.
652 */
653 insert_idx = physmap_idx;
654 for (i = 0; i < physmap_idx; i += 2) {
655 if (base < physmap[i + 1]) {
656 if (base + length <= physmap[i]) {
657 insert_idx = i;
658 break;
659 }
660 if (boothowto & RB_VERBOSE)
661 printf(
662 "Overlapping memory regions, ignoring second region\n");
663 return (1);
664 }
665 }
666
667 /* See if we can prepend to the next entry. */
668 if (insert_idx < physmap_idx && base + length == physmap[insert_idx]) {
669 physmap[insert_idx] = base;
670 return (1);
671 }
672
673 /* See if we can append to the previous entry. */
674 if (insert_idx > 0 && base == physmap[insert_idx - 1]) {
675 physmap[insert_idx - 1] += length;
676 return (1);
677 }
678
679 if (physmap_idx == PHYS_AVAIL_ENTRIES) {
680 printf(
681 "Too many segments in the physical address map, giving up\n");
682 return (0);
683 }
684
685 /*
686 * Move the last 'N' entries down to make room for the new
687 * entry if needed.
688 */
689 for (i = physmap_idx; i > insert_idx; i -= 2) {
690 physmap[i] = physmap[i - 2];
691 physmap[i + 1] = physmap[i - 1];
692 }
693
694 physmap_idx += 2;
695 *physmap_idxp = physmap_idx;
696
697 /* Insert the new entry. */
698 physmap[insert_idx] = base;
699 physmap[insert_idx + 1] = base + length;
700 return (1);
701 }
702
703 void
704 bios_add_smap_entries(struct bios_smap *smapbase, u_int32_t smapsize,
705 vm_paddr_t *physmap, int *physmap_idx)
706 {
707 struct bios_smap *smap, *smapend;
708
709 smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);
710
711 for (smap = smapbase; smap < smapend; smap++) {
712 if (boothowto & RB_VERBOSE)
713 printf("SMAP type=%02x base=%016lx len=%016lx\n",
714 smap->type, smap->base, smap->length);
715
716 if (smap->type != SMAP_TYPE_MEMORY)
717 continue;
718
719 if (!add_physmap_entry(smap->base, smap->length, physmap,
720 physmap_idx))
721 break;
722 }
723 }
724
725 static void
726 add_efi_map_entries(struct efi_map_header *efihdr, vm_paddr_t *physmap,
727 int *physmap_idx)
728 {
729 struct efi_md *map, *p;
730 const char *type;
731 size_t efisz;
732 int ndesc, i;
733
734 static const char *types[] = {
735 "Reserved",
736 "LoaderCode",
737 "LoaderData",
738 "BootServicesCode",
739 "BootServicesData",
740 "RuntimeServicesCode",
741 "RuntimeServicesData",
742 "ConventionalMemory",
743 "UnusableMemory",
744 "ACPIReclaimMemory",
745 "ACPIMemoryNVS",
746 "MemoryMappedIO",
747 "MemoryMappedIOPortSpace",
748 "PalCode",
749 "PersistentMemory"
750 };
751
752 /*
753 * Memory map data provided by UEFI via the GetMemoryMap
754 * Boot Services API.
755 */
756 efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf;
757 map = (struct efi_md *)((uint8_t *)efihdr + efisz);
758
759 if (efihdr->descriptor_size == 0)
760 return;
761 ndesc = efihdr->memory_size / efihdr->descriptor_size;
762
763 if (boothowto & RB_VERBOSE)
764 printf("%23s %12s %12s %8s %4s\n",
765 "Type", "Physical", "Virtual", "#Pages", "Attr");
766
767 TUNABLE_INT_FETCH("machdep.efirt.regs", &efi_map_regs);
768 for (i = 0, p = map; i < ndesc; i++,
769 p = efi_next_descriptor(p, efihdr->descriptor_size)) {
770 if (boothowto & RB_VERBOSE) {
771 if (p->md_type < nitems(types))
772 type = types[p->md_type];
773 else
774 type = "<INVALID>";
775 printf("%23s %012lx %012lx %08lx ", type, p->md_phys,
776 p->md_virt, p->md_pages);
777 if (p->md_attr & EFI_MD_ATTR_UC)
778 printf("UC ");
779 if (p->md_attr & EFI_MD_ATTR_WC)
780 printf("WC ");
781 if (p->md_attr & EFI_MD_ATTR_WT)
782 printf("WT ");
783 if (p->md_attr & EFI_MD_ATTR_WB)
784 printf("WB ");
785 if (p->md_attr & EFI_MD_ATTR_UCE)
786 printf("UCE ");
787 if (p->md_attr & EFI_MD_ATTR_WP)
788 printf("WP ");
789 if (p->md_attr & EFI_MD_ATTR_RP)
790 printf("RP ");
791 if (p->md_attr & EFI_MD_ATTR_XP)
792 printf("XP ");
793 if (p->md_attr & EFI_MD_ATTR_NV)
794 printf("NV ");
795 if (p->md_attr & EFI_MD_ATTR_MORE_RELIABLE)
796 printf("MORE_RELIABLE ");
797 if (p->md_attr & EFI_MD_ATTR_RO)
798 printf("RO ");
799 if (p->md_attr & EFI_MD_ATTR_RT)
800 printf("RUNTIME");
801 printf("\n");
802 }
803
804 switch (p->md_type) {
805 case EFI_MD_TYPE_BS_CODE:
806 case EFI_MD_TYPE_BS_DATA:
807 if (EFI_MAP_BOOTTYPE_ALLOWED(p->md_type))
808 continue;
809 /* FALLTHROUGH */
810 case EFI_MD_TYPE_CODE:
811 case EFI_MD_TYPE_DATA:
812 case EFI_MD_TYPE_FREE:
813 /*
814 * We're allowed to use any entry with these types.
815 */
816 break;
817 default:
818 continue;
819 }
820
821 if (!add_physmap_entry(p->md_phys, p->md_pages * EFI_PAGE_SIZE,
822 physmap, physmap_idx))
823 break;
824 }
825 }
826
827 static void
828 native_parse_memmap(vm_paddr_t *physmap, int *physmap_idx)
829 {
830 struct bios_smap *smap;
831 struct efi_map_header *efihdr;
832 u_int32_t size;
833
834 /*
835 * Memory map from INT 15:E820.
836 *
837 * subr_module.c says:
838 * "Consumer may safely assume that size value precedes data."
839 * i.e., an int32_t immediately precedes smap.
840 */
841
842 efihdr = (struct efi_map_header *)preload_search_info(preload_kmdp,
843 MODINFO_METADATA | MODINFOMD_EFI_MAP);
844 smap = (struct bios_smap *)preload_search_info(preload_kmdp,
845 MODINFO_METADATA | MODINFOMD_SMAP);
846 if (efihdr == NULL && smap == NULL)
847 panic("No BIOS smap or EFI map info from loader!");
848
849 if (efihdr != NULL) {
850 add_efi_map_entries(efihdr, physmap, physmap_idx);
851 strlcpy(bootmethod, "UEFI", sizeof(bootmethod));
852 } else {
853 size = *((u_int32_t *)smap - 1);
854 bios_add_smap_entries(smap, size, physmap, physmap_idx);
855 strlcpy(bootmethod, "BIOS", sizeof(bootmethod));
856 }
857 }
858
859 #define PAGES_PER_GB (1024 * 1024 * 1024 / PAGE_SIZE)
860
861 /*
862 * Populate the (physmap) array with base/bound pairs describing the
863 * available physical memory in the system, then test this memory and
864 * build the phys_avail array describing the actually-available memory.
865 *
866 * Total memory size may be set by the kernel environment variable
867 * hw.physmem or the compile-time define MAXMEM.
868 *
869 * XXX first should be vm_paddr_t.
870 */
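/*
 * For illustration (hypothetical values): an e820 map reporting
 * 0x00000000-0x0009ffff and 0x00100000-0xbfffffff would yield
 * physmap[] = { 0x0, 0xa0000, 0x100000, 0xc0000000 }; the kernel,
 * dcons and message-buffer carve-outs below then produce the
 * phys_avail[] subset that is actually handed to the VM system.
 */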
871 static void
872 getmemsize(u_int64_t first)
873 {
874 int i, physmap_idx, pa_indx, da_indx;
875 vm_paddr_t pa, physmap[PHYS_AVAIL_ENTRIES];
876 u_long physmem_start, physmem_tunable, memtest;
877 pt_entry_t *pte;
878 quad_t dcons_addr, dcons_size;
879 int page_counter;
880
881 TSENTER();
882 /*
883 * Tell the physical memory allocator about pages used to store
884 * the kernel and preloaded data. See kmem_bootstrap_free().
885 */
886 vm_phys_early_add_seg((vm_paddr_t)kernphys, trunc_page(first));
887
888 bzero(physmap, sizeof(physmap));
889 physmap_idx = 0;
890
891 init_ops.parse_memmap(physmap, &physmap_idx);
892 physmap_idx -= 2;
893
894 /*
895 * Find the 'base memory' segment for SMP
896 */
897 basemem = 0;
898 for (i = 0; i <= physmap_idx; i += 2) {
899 if (physmap[i] <= 0xA0000) {
900 basemem = physmap[i + 1] / 1024;
901 break;
902 }
903 }
904 if (basemem == 0 || basemem > 640) {
905 if (bootverbose)
906 printf(
907 "Memory map doesn't contain a basemem segment, faking it");
908 basemem = 640;
909 }
910
911 /*
912 * Maxmem isn't the "maximum memory", it's one larger than the
913 * highest page of the physical address space. It should be
914 * called something like "Maxphyspage". We may adjust this
915 * based on ``hw.physmem'' and the results of the memory test.
916 */
917 Maxmem = atop(physmap[physmap_idx + 1]);
918
919 #ifdef MAXMEM
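/* MAXMEM is specified in kilobytes; Maxmem counts 4 KB pages, hence the divide by 4. */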
920 Maxmem = MAXMEM / 4;
921 #endif
922
923 if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
924 Maxmem = atop(physmem_tunable);
925
926 /*
927 * The boot memory test is disabled by default, as it takes a
928 * significant amount of time on large-memory systems, and is
929 * unfriendly to virtual machines as it unnecessarily touches all
930 * pages.
931 *
932 * A general name is used as the code may be extended to support
933 * additional tests beyond the current "page present" test.
934 */
935 memtest = 0;
936 TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest);
937
938 /*
939 * Don't allow MAXMEM or hw.physmem to extend the amount of memory
940 * in the system.
941 */
942 if (Maxmem > atop(physmap[physmap_idx + 1]))
943 Maxmem = atop(physmap[physmap_idx + 1]);
944
945 if (atop(physmap[physmap_idx + 1]) != Maxmem &&
946 (boothowto & RB_VERBOSE))
947 printf("Physical memory use set to %ldK\n", Maxmem * 4);
948
949 /* call pmap initialization to make new kernel address space */
950 pmap_bootstrap(&first);
951
952 /*
953 * Size up each available chunk of physical memory.
954 *
955 * XXX Some BIOSes corrupt low 64KB between suspend and resume.
956 * By default, mask off the first 16 pages unless we appear to be
957 * running in a VM.
958 */
959 physmem_start = (vm_guest > VM_GUEST_NO ? 1 : 16) << PAGE_SHIFT;
960 TUNABLE_ULONG_FETCH("hw.physmem.start", &physmem_start);
961 if (physmap[0] < physmem_start) {
962 if (physmem_start < PAGE_SIZE)
963 physmap[0] = PAGE_SIZE;
964 else if (physmem_start >= physmap[1])
965 physmap[0] = round_page(physmap[1] - PAGE_SIZE);
966 else
967 physmap[0] = round_page(physmem_start);
968 }
969 pa_indx = 0;
970 da_indx = 1;
971 phys_avail[pa_indx++] = physmap[0];
972 phys_avail[pa_indx] = physmap[0];
973 dump_avail[da_indx] = physmap[0];
974 pte = CMAP1;
975
976 /*
977 * Get dcons buffer address
978 */
979 if (getenv_quad("dcons.addr", &dcons_addr) == 0 ||
980 getenv_quad("dcons.size", &dcons_size) == 0)
981 dcons_addr = 0;
982
983 /*
984 * physmap is in bytes, so when converting to page boundaries,
985 * round up the start address and round down the end address.
986 */
987 page_counter = 0;
988 if (memtest != 0)
989 printf("Testing system memory");
990 for (i = 0; i <= physmap_idx; i += 2) {
991 vm_paddr_t end;
992
993 end = ptoa((vm_paddr_t)Maxmem);
994 if (physmap[i + 1] < end)
995 end = trunc_page(physmap[i + 1]);
996 for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
997 int *ptr = (int *)CADDR1;
998 int tmp;
999 bool full, page_bad;
1000
1001 full = false;
1002 /*
1003 * block out kernel memory as not available.
1004 */
1005 if (pa >= (vm_paddr_t)kernphys && pa < first)
1006 goto do_dump_avail;
1007
1008 /*
1009 * block out dcons buffer
1010 */
1011 if (dcons_addr > 0
1012 && pa >= trunc_page(dcons_addr)
1013 && pa < dcons_addr + dcons_size)
1014 goto do_dump_avail;
1015
1016 page_bad = false;
1017 if (memtest == 0)
1018 goto skip_memtest;
1019
1020 /*
1021 * Print a "." every GB to show we're making
1022 * progress.
1023 */
1024 page_counter++;
1025 if ((page_counter % PAGES_PER_GB) == 0)
1026 printf(".");
1027
1028 /*
1029 * map page into kernel: valid, read/write, non-cacheable
1030 */
1031 *pte = pa | PG_V | PG_RW | PG_NC_PWT | PG_NC_PCD;
1032 invltlb();
1033
1034 tmp = *(int *)ptr;
1035 /*
1036 * Test for alternating 1's and 0's
1037 */
1038 *(volatile int *)ptr = 0xaaaaaaaa;
1039 if (*(volatile int *)ptr != 0xaaaaaaaa)
1040 page_bad = true;
1041 /*
1042 * Test for alternating 0's and 1's
1043 */
1044 *(volatile int *)ptr = 0x55555555;
1045 if (*(volatile int *)ptr != 0x55555555)
1046 page_bad = true;
1047 /*
1048 * Test for all 1's
1049 */
1050 *(volatile int *)ptr = 0xffffffff;
1051 if (*(volatile int *)ptr != 0xffffffff)
1052 page_bad = true;
1053 /*
1054 * Test for all 0's
1055 */
1056 *(volatile int *)ptr = 0x0;
1057 if (*(volatile int *)ptr != 0x0)
1058 page_bad = true;
1059 /*
1060 * Restore original value.
1061 */
1062 *(int *)ptr = tmp;
1063
1064 skip_memtest:
1065 /*
1066 * Adjust array of valid/good pages.
1067 */
1068 if (page_bad == true)
1069 continue;
1070 /*
1071 * If this good page is a continuation of the
1072 * previous set of good pages, then just increase
1073 * the end pointer. Otherwise start a new chunk.
1074 * Note that the "end" entry points one past the last
1075 * valid address, making the range >= start and < end.
1076 * If we're also doing a speculative memory
1077 * test and we're at or past the end, bump up Maxmem
1078 * so that we keep going. The first bad page
1079 * will terminate the loop.
1080 */
1081 if (phys_avail[pa_indx] == pa) {
1082 phys_avail[pa_indx] += PAGE_SIZE;
1083 } else {
1084 pa_indx++;
1085 if (pa_indx == PHYS_AVAIL_ENTRIES) {
1086 printf(
1087 "Too many holes in the physical address space, giving up\n");
1088 pa_indx--;
1089 full = true;
1090 goto do_dump_avail;
1091 }
1092 phys_avail[pa_indx++] = pa; /* start */
1093 phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */
1094 }
1095 physmem++;
1096 do_dump_avail:
1097 if (dump_avail[da_indx] == pa) {
1098 dump_avail[da_indx] += PAGE_SIZE;
1099 } else {
1100 da_indx++;
1101 if (da_indx == PHYS_AVAIL_ENTRIES) {
1102 da_indx--;
1103 goto do_next;
1104 }
1105 dump_avail[da_indx++] = pa; /* start */
1106 dump_avail[da_indx] = pa + PAGE_SIZE; /* end */
1107 }
1108 do_next:
1109 if (full)
1110 break;
1111 }
1112 }
1113 *pte = 0;
1114 invltlb();
1115 if (memtest != 0)
1116 printf("\n");
1117
1118 /*
1119 * XXX
1120 * The last chunk must contain at least one page plus the message
1121 * buffer to avoid complicating other code (message buffer address
1122 * calculation, etc.).
1123 */
1124 while (phys_avail[pa_indx - 1] + PAGE_SIZE +
1125 round_page(msgbufsize) >= phys_avail[pa_indx]) {
1126 physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
1127 phys_avail[pa_indx--] = 0;
1128 phys_avail[pa_indx--] = 0;
1129 }
1130
1131 Maxmem = atop(phys_avail[pa_indx]);
1132
1133 /* Trim off space for the message buffer. */
1134 phys_avail[pa_indx] -= round_page(msgbufsize);
1135
1136 /* Map the message buffer. */
1137 msgbufp = (struct msgbuf *)PHYS_TO_DMAP(phys_avail[pa_indx]);
1138 TSEXIT();
1139 }
1140
1141 static void
1142 native_parse_preload_data(u_int64_t modulep)
1143 {
1144 char *envp;
1145 #ifdef DDB
1146 vm_offset_t ksym_start;
1147 vm_offset_t ksym_end;
1148 #endif
1149
1150 preload_metadata = (caddr_t)(uintptr_t)(modulep + KERNBASE);
1151 preload_bootstrap_relocate(KERNBASE);
1152 preload_initkmdp(true);
1153 boothowto = MD_FETCH(preload_kmdp, MODINFOMD_HOWTO, int);
1154 envp = MD_FETCH(preload_kmdp, MODINFOMD_ENVP, char *);
1155 if (envp != NULL)
1156 envp += KERNBASE;
1157 init_static_kenv(envp, 0);
1158 #ifdef DDB
1159 ksym_start = MD_FETCH(preload_kmdp, MODINFOMD_SSYM, uintptr_t);
1160 ksym_end = MD_FETCH(preload_kmdp, MODINFOMD_ESYM, uintptr_t);
1161 db_fetch_ksymtab(ksym_start, ksym_end, 0);
1162 #endif
1163 efi_systbl_phys = MD_FETCH(preload_kmdp, MODINFOMD_FW_HANDLE,
1164 vm_paddr_t);
1165 }
1166
1167 static void
1168 native_clock_source_init(void)
1169 {
1170 i8254_init();
1171 }
1172
1173 static void
1174 amd64_kdb_init(void)
1175 {
1176 kdb_init();
1177 #ifdef KDB
1178 if (boothowto & RB_KDB)
1179 kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger");
1180 #endif
1181 }
1182
1183 /* Set up the fast syscall stuff */
1184 void
1185 amd64_conf_fast_syscall(void)
1186 {
1187 uint64_t msr;
1188
1189 msr = rdmsr(MSR_EFER) | EFER_SCE;
1190 wrmsr(MSR_EFER, msr);
1191 wrmsr(MSR_LSTAR, pti ? (u_int64_t)IDTVEC(fast_syscall_pti) :
1192 (u_int64_t)IDTVEC(fast_syscall));
1193 wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
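/*
 * MSR_STAR layout: bits 47:32 give the kernel %cs/%ss selector base used
 * by SYSCALL, bits 63:48 the user selector base used by SYSRET.
 * GUCODE32_SEL is loaded there because SYSRET to 64-bit mode uses
 * STAR[63:48] + 16 for %cs, which lands on GUCODE_SEL in this GDT.
 */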
1194 msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
1195 ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
1196 wrmsr(MSR_STAR, msr);
1197 wrmsr(MSR_SF_MASK, PSL_NT | PSL_T | PSL_I | PSL_C | PSL_D | PSL_AC);
1198 }
1199
1200 void
1201 amd64_bsp_pcpu_init1(struct pcpu *pc)
1202 {
1203 struct user_segment_descriptor *gdt;
1204
1205 PCPU_SET(prvspace, pc);
1206 gdt = *PCPU_PTR(gdt);
1207 PCPU_SET(curthread, &thread0);
1208 PCPU_SET(tssp, PCPU_PTR(common_tss));
1209 PCPU_SET(tss, (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
1210 PCPU_SET(ldt, (struct system_segment_descriptor *)&gdt[GUSERLDT_SEL]);
1211 PCPU_SET(fs32p, &gdt[GUFS32_SEL]);
1212 PCPU_SET(gs32p, &gdt[GUGS32_SEL]);
1213 PCPU_SET(ucr3_load_mask, PMAP_UCR3_NOMASK);
1214 PCPU_SET(smp_tlb_gen, 1);
1215 }
1216
1217 void
1218 amd64_bsp_pcpu_init2(uint64_t rsp0)
1219 {
1220
1221 PCPU_SET(rsp0, rsp0);
1222 PCPU_SET(pti_rsp0, ((vm_offset_t)PCPU_PTR(pti_stack) +
1223 PC_PTI_STACK_SZ * sizeof(uint64_t)) & ~0xful);
1224 PCPU_SET(curpcb, thread0.td_pcb);
1225 }
1226
1227 void
1228 amd64_bsp_ist_init(struct pcpu *pc)
1229 {
1230 struct nmi_pcpu *np;
1231 struct amd64tss *tssp;
1232
1233 tssp = &pc->pc_common_tss;
1234
1235 /* doublefault stack space, runs on ist1 */
1236 np = ((struct nmi_pcpu *)&dblfault_stack[sizeof(dblfault_stack)]) - 1;
1237 np->np_pcpu = (register_t)pc;
1238 tssp->tss_ist1 = (long)np;
1239
1240 /*
1241 * NMI stack, runs on ist2. The pcpu pointer is stored just
1242 * above the start of the ist2 stack.
1243 */
1244 np = ((struct nmi_pcpu *)&nmi0_stack[sizeof(nmi0_stack)]) - 1;
1245 np->np_pcpu = (register_t)pc;
1246 tssp->tss_ist2 = (long)np;
1247
1248 /*
1249 * MC# stack, runs on ist3. The pcpu pointer is stored just
1250 * above the start of the ist3 stack.
1251 */
1252 np = ((struct nmi_pcpu *)&mce0_stack[sizeof(mce0_stack)]) - 1;
1253 np->np_pcpu = (register_t)pc;
1254 tssp->tss_ist3 = (long)np;
1255
1256 /*
1257 * DB# stack, runs on ist4.
1258 */
1259 np = ((struct nmi_pcpu *)&dbg0_stack[sizeof(dbg0_stack)]) - 1;
1260 np->np_pcpu = (register_t)pc;
1261 tssp->tss_ist4 = (long)np;
1262 }
1263
1264 /*
1265 * Calculate the kernel load address by inspecting the page table created by the loader.
1266 * The assumptions:
1267 * - kernel is mapped at KERNBASE, backed by contiguous phys memory
1268 * aligned at 2M, below 4G (the latter is important for AP startup)
1269 * - there is a 2M hole at KERNBASE (KERNSTART = KERNBASE + 2M)
1270 * - kernel is mapped with 2M superpages
1271 * - all participating memory, i.e. kernel, modules, metadata,
1272 * page table is accessible by pre-created 1:1 mapping
1273 * (right now loader creates 1:1 mapping for lower 4G, and all
1274 * memory is from there)
1275 * - there is a usable memory block right after the end of the
1276 * mapped kernel and all modules/metadata, pointed to by
1277 * physfree, for early allocations
1278 */
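/*
 * The walk below therefore reads the loader's page tables directly through
 * that 1:1 mapping: %cr3 -> PML4E -> PDPE -> 2M PDE for KERNSTART, and the
 * PDE's frame address is the physical address the kernel was loaded at.
 */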
1279 vm_paddr_t __nosanitizeaddress __nosanitizememory
1280 amd64_loadaddr(void)
1281 {
1282 pml4_entry_t *pml4e;
1283 pdp_entry_t *pdpe;
1284 pd_entry_t *pde;
1285 uint64_t cr3;
1286
1287 cr3 = rcr3();
1288 pml4e = (pml4_entry_t *)cr3 + pmap_pml4e_index(KERNSTART);
1289 pdpe = (pdp_entry_t *)(*pml4e & PG_FRAME) + pmap_pdpe_index(KERNSTART);
1290 pde = (pd_entry_t *)(*pdpe & PG_FRAME) + pmap_pde_index(KERNSTART);
1291 return (*pde & PG_FRAME);
1292 }
1293
1294 u_int64_t
1295 hammer_time(u_int64_t modulep, u_int64_t physfree)
1296 {
1297 int gsel_tss, x;
1298 struct pcpu *pc;
1299 uint64_t rsp0;
1300 char *env;
1301 struct user_segment_descriptor *gdt;
1302 struct region_descriptor r_gdt;
1303 size_t kstack0_sz;
1304
1305 TSRAW(&thread0, TS_ENTER, __func__, NULL);
1306
1307 kernphys = amd64_loadaddr();
1308
1309 physfree += kernphys;
1310
1311 /* Initializes preload_kmdp */
1312 init_ops.parse_preload_data(modulep);
1313
1314 efi_boot = preload_search_info(preload_kmdp, MODINFO_METADATA |
1315 MODINFOMD_EFI_MAP) != NULL;
1316
1317 if (!efi_boot) {
1318 /* Tell the bios to warmboot next time */
1319 atomic_store_short((u_short *)0x472, 0x1234);
1320 }
1321
1322 physfree += ucode_load_bsp(physfree - kernphys + KERNSTART);
1323 physfree = roundup2(physfree, PAGE_SIZE);
1324
1325 identify_cpu1();
1326 identify_hypervisor();
1327 identify_hypervisor_smbios();
1328 identify_cpu_fixup_bsp();
1329 identify_cpu2();
1330 initializecpucache();
1331
1332 /*
1333 * Check for pti, pcid, and invpcid before ifuncs are
1334 * resolved, to correctly select the implementation for
1335 * pmap_activate_sw_mode().
1336 */
1337 pti = pti_get_default();
1338 TUNABLE_INT_FETCH("vm.pmap.pti", &pti);
1339 TUNABLE_INT_FETCH("vm.pmap.pcid_enabled", &pmap_pcid_enabled);
1340 if ((cpu_feature2 & CPUID2_PCID) != 0 && pmap_pcid_enabled) {
1341 invpcid_works = (cpu_stdext_feature &
1342 CPUID_STDEXT_INVPCID) != 0;
1343 } else {
1344 pmap_pcid_enabled = 0;
1345 }
1346
1347 /*
1348 * Now we can do small core initialization, after the PCID
1349 * CPU features and user knobs are evaluated.
1350 */
1351 TUNABLE_INT_FETCH("vm.pmap.pcid_invlpg_workaround",
1352 &pmap_pcid_invlpg_workaround_uena);
1353 cpu_init_small_core();
1354
1355 if ((cpu_feature2 & CPUID2_XSAVE) != 0) {
1356 use_xsave = 1;
1357 TUNABLE_INT_FETCH("hw.use_xsave", &use_xsave);
1358 }
1359
1360 link_elf_ireloc();
1361
1362 /*
1363 * This may be done better later if it gets more high level
1364 * components in it. If so just link td->td_proc here.
1365 */
1366 proc_linkup0(&proc0, &thread0);
1367
1368 /* Init basic tunables, hz etc */
1369 init_param1();
1370
1371 thread0.td_kstack = physfree - kernphys + KERNSTART;
1372 thread0.td_kstack_pages = kstack_pages;
1373 kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE;
1374 bzero((void *)thread0.td_kstack, kstack0_sz);
1375 physfree += kstack0_sz;
1376
1377 /*
1378 * Initialize enough of thread0 for delayed invalidation to
1379 * work very early. Rely on thread0.td_base_pri being
1380 * zero-initialized; it is reset to PVM at proc0_init().
1381 */
1382 pmap_thread_init_invl_gen(&thread0);
1383
1384 pc = &temp_bsp_pcpu;
1385 pcpu_init(pc, 0, sizeof(struct pcpu));
1386 gdt = &temp_bsp_pcpu.pc_gdt[0];
1387
1388 /*
1389 * make gdt memory segments
1390 */
1391 for (x = 0; x < NGDT; x++) {
1392 if (x != GPROC0_SEL && x != (GPROC0_SEL + 1) &&
1393 x != GUSERLDT_SEL && x != (GUSERLDT_SEL + 1))
1394 ssdtosd(&gdt_segs[x], &gdt[x]);
1395 }
1396 gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&pc->pc_common_tss;
1397 ssdtosyssd(&gdt_segs[GPROC0_SEL],
1398 (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
1399
1400 r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
1401 r_gdt.rd_base = (long)gdt;
1402 lgdt(&r_gdt);
1403
1404 wrmsr(MSR_FSBASE, 0); /* User value */
1405 wrmsr(MSR_GSBASE, (u_int64_t)pc);
1406 wrmsr(MSR_KGSBASE, 0); /* User value while in the kernel */
1407
1408 dpcpu_init((void *)(physfree - kernphys + KERNSTART), 0);
1409 physfree += DPCPU_SIZE;
1410 amd64_bsp_pcpu_init1(pc);
1411 /* Non-late cninit() and printf() can be moved up to here. */
1412
1413 /*
1414 * Initialize mutexes.
1415 *
1416 * icu_lock: in order to allow an interrupt to occur in a critical
1417 * section, to set pcpu->ipending (etc...) properly, we
1418 * must be able to get the icu lock, so it can't be
1419 * under witness.
1420 */
1421 mutex_init();
1422 mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS);
1423 mtx_init(&dt_lock, "descriptor tables", NULL, MTX_DEF);
1424
1425 /* exceptions */
1426 for (x = 0; x < NIDT; x++)
1427 setidt(x, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_SYSIGT,
1428 SEL_KPL, 0);
1429 setidt(IDT_DE, pti ? &IDTVEC(div_pti) : &IDTVEC(div), SDT_SYSIGT,
1430 SEL_KPL, 0);
1431 setidt(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 4);
1432 setidt(IDT_NMI, &IDTVEC(nmi), SDT_SYSIGT, SEL_KPL, 2);
1433 setidt(IDT_BP, pti ? &IDTVEC(bpt_pti) : &IDTVEC(bpt), SDT_SYSIGT,
1434 SEL_UPL, 0);
1435 setidt(IDT_OF, pti ? &IDTVEC(ofl_pti) : &IDTVEC(ofl), SDT_SYSIGT,
1436 SEL_UPL, 0);
1437 setidt(IDT_BR, pti ? &IDTVEC(bnd_pti) : &IDTVEC(bnd), SDT_SYSIGT,
1438 SEL_KPL, 0);
1439 setidt(IDT_UD, pti ? &IDTVEC(ill_pti) : &IDTVEC(ill), SDT_SYSIGT,
1440 SEL_KPL, 0);
1441 setidt(IDT_NM, pti ? &IDTVEC(dna_pti) : &IDTVEC(dna), SDT_SYSIGT,
1442 SEL_KPL, 0);
1443 setidt(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
1444 setidt(IDT_FPUGP, pti ? &IDTVEC(fpusegm_pti) : &IDTVEC(fpusegm),
1445 SDT_SYSIGT, SEL_KPL, 0);
1446 setidt(IDT_TS, pti ? &IDTVEC(tss_pti) : &IDTVEC(tss), SDT_SYSIGT,
1447 SEL_KPL, 0);
1448 setidt(IDT_NP, pti ? &IDTVEC(missing_pti) : &IDTVEC(missing),
1449 SDT_SYSIGT, SEL_KPL, 0);
1450 setidt(IDT_SS, pti ? &IDTVEC(stk_pti) : &IDTVEC(stk), SDT_SYSIGT,
1451 SEL_KPL, 0);
1452 setidt(IDT_GP, pti ? &IDTVEC(prot_pti) : &IDTVEC(prot), SDT_SYSIGT,
1453 SEL_KPL, 0);
1454 setidt(IDT_PF, pti ? &IDTVEC(page_pti) : &IDTVEC(page), SDT_SYSIGT,
1455 SEL_KPL, 0);
1456 setidt(IDT_MF, pti ? &IDTVEC(fpu_pti) : &IDTVEC(fpu), SDT_SYSIGT,
1457 SEL_KPL, 0);
1458 setidt(IDT_AC, pti ? &IDTVEC(align_pti) : &IDTVEC(align), SDT_SYSIGT,
1459 SEL_KPL, 0);
1460 setidt(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 3);
1461 setidt(IDT_XF, pti ? &IDTVEC(xmm_pti) : &IDTVEC(xmm), SDT_SYSIGT,
1462 SEL_KPL, 0);
1463 #ifdef KDTRACE_HOOKS
1464 setidt(IDT_DTRACE_RET, pti ? &IDTVEC(dtrace_ret_pti) :
1465 &IDTVEC(dtrace_ret), SDT_SYSIGT, SEL_UPL, 0);
1466 #endif
1467 #ifdef XENHVM
1468 setidt(IDT_EVTCHN, pti ? &IDTVEC(xen_intr_upcall_pti) :
1469 &IDTVEC(xen_intr_upcall), SDT_SYSIGT, SEL_KPL, 0);
1470 #endif
1471 r_idt.rd_limit = sizeof(idt0) - 1;
1472 r_idt.rd_base = (long) idt;
1473 lidt(&r_idt);
1474
1475 TUNABLE_INT_FETCH("hw.ibrs_disable", &hw_ibrs_disable);
1476 TUNABLE_INT_FETCH("machdep.mitigations.ibrs.disable", &hw_ibrs_disable);
1477
1478 TUNABLE_INT_FETCH("hw.spec_store_bypass_disable", &hw_ssb_disable);
1479 TUNABLE_INT_FETCH("machdep.mitigations.ssb.disable", &hw_ssb_disable);
1480
1481 TUNABLE_INT_FETCH("machdep.syscall_ret_flush_l1d",
1482 &syscall_ret_l1d_flush_mode);
1483
1484 TUNABLE_INT_FETCH("hw.mds_disable", &hw_mds_disable);
1485 TUNABLE_INT_FETCH("machdep.mitigations.mds.disable", &hw_mds_disable);
1486
1487 TUNABLE_INT_FETCH("machdep.mitigations.taa.enable", &x86_taa_enable);
1488
1489 TUNABLE_INT_FETCH("machdep.mitigations.rngds.enable",
1490 &x86_rngds_mitg_enable);
1491
1492 TUNABLE_INT_FETCH("machdep.mitigations.zenbleed.enable",
1493 &zenbleed_enable);
1494 zenbleed_sanitize_enable();
1495
1496 finishidentcpu(); /* Final stage of CPU initialization */
1497
1498 invlpgb_works = (amd_extended_feature_extensions &
1499 AMDFEID_INVLPGB) != 0;
1500 TUNABLE_INT_FETCH("vm.pmap.invlpgb_works", &invlpgb_works);
1501 if (invlpgb_works)
1502 invlpgb_maxcnt = cpu_procinfo3 & AMDID_INVLPGB_MAXCNT;
1503
1504 /*
1505 * Initialize the clock before the console so that console
1506 * initialization can use DELAY().
1507 */
1508 clock_init();
1509
1510 initializecpu(); /* Initialize CPU registers */
1511
1512 amd64_bsp_ist_init(pc);
1513
1514 /* Set the IO permission bitmap (empty due to tss seg limit) */
1515 pc->pc_common_tss.tss_iobase = sizeof(struct amd64tss) +
1516 IOPERM_BITMAP_SIZE;
1517
1518 gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
1519 ltr(gsel_tss);
1520
1521 amd64_conf_fast_syscall();
1522
1523 /*
1524 * We initialize the PCB pointer early so that exception
1525 * handlers will work. Also set up td_critnest to short-cut
1526 * the page fault handler.
1527 */
1528 cpu_max_ext_state_size = sizeof(struct savefpu);
1529 set_top_of_stack_td(&thread0);
1530 thread0.td_pcb = get_pcb_td(&thread0);
1531 thread0.td_critnest = 1;
1532
1533 /*
1534 * The console and kdb should be initialized even earlier than here,
1535 * but some console drivers don't work until after getmemsize().
1536 * Default to late console initialization to support these drivers.
1537 * This loses mainly printf()s in getmemsize() and early debugging.
1538 */
1539 TUNABLE_INT_FETCH("debug.late_console", &late_console);
1540 if (!late_console) {
1541 cninit();
1542 amd64_kdb_init();
1543 }
1544
1545 getmemsize(physfree);
1546 init_param2(physmem);
1547
1548 /* now running on new page tables, configured, and u/iom is accessible */
1549
1550 #ifdef DEV_PCI
1551 /* This call might adjust phys_avail[]. */
1552 pci_early_quirks();
1553 #endif
1554
1555 if (late_console)
1556 cninit();
1557
1558 /*
1559 * Dump the boot metadata. We have to wait for cninit() since console
1560 * output is required. If it's grossly incorrect the kernel will never
1561 * make it this far.
1562 */
1563 if (getenv_is_true("debug.dump_modinfo_at_boot"))
1564 preload_dump();
1565
1566 #ifdef DEV_ISA
1567 #ifdef DEV_ATPIC
1568 elcr_probe();
1569 atpic_startup();
1570 #else
1571 /* Reset and mask the atpics and leave them shut down. */
1572 atpic_reset();
1573
1574 /*
1575 * Point the ICU spurious interrupt vectors at the APIC spurious
1576 * interrupt handler.
1577 */
1578 setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
1579 setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
1580 #endif
1581 #else
1582 #error "have you forgotten the isa device?"
1583 #endif
1584
1585 if (late_console)
1586 amd64_kdb_init();
1587
1588 msgbufinit(msgbufp, msgbufsize);
1589 fpuinit();
1590
1591 /* make an initial tss so cpu can get interrupt stack on syscall! */
1592 rsp0 = thread0.td_md.md_stack_base;
1593 /* Ensure the stack is aligned to 16 bytes */
1594 rsp0 &= ~0xFul;
1595 PCPU_PTR(common_tss)->tss_rsp0 = rsp0;
1596 amd64_bsp_pcpu_init2(rsp0);
1597
1598 /* transfer to user mode */
1599
1600 _ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
1601 _udatasel = GSEL(GUDATA_SEL, SEL_UPL);
1602 _ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL);
1603 _ufssel = GSEL(GUFS32_SEL, SEL_UPL);
1604 _ugssel = GSEL(GUGS32_SEL, SEL_UPL);
1605
1606 load_ds(_udatasel);
1607 load_es(_udatasel);
1608 load_fs(_ufssel);
1609
1610 /* setup proc 0's pcb */
1611 thread0.td_pcb->pcb_flags = 0;
1612
1613 env = kern_getenv("kernelname");
1614 if (env != NULL)
1615 strlcpy(kernelname, env, sizeof(kernelname));
1616
1617 kcsan_cpu_init(0);
1618
1619 #ifdef FDT
1620 x86_init_fdt();
1621 #endif
1622 thread0.td_critnest = 0;
1623
1624 kasan_init();
1625 kmsan_init();
1626
1627 TSEXIT();
1628
1629 /* Location of kernel stack for locore */
1630 return (thread0.td_md.md_stack_base);
1631 }
1632
1633 void
1634 cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
1635 {
1636
1637 pcpu->pc_acpi_id = 0xffffffff;
1638 }
1639
1640 static int
1641 smap_sysctl_handler(SYSCTL_HANDLER_ARGS)
1642 {
1643 struct bios_smap *smapbase;
1644 struct bios_smap_xattr smap;
1645 uint32_t *smapattr;
1646 int count, error, i;
1647
1648 /* Retrieve the system memory map from the loader. */
1649 smapbase = (struct bios_smap *)preload_search_info(preload_kmdp,
1650 MODINFO_METADATA | MODINFOMD_SMAP);
1651 if (smapbase == NULL)
1652 return (0);
1653 smapattr = (uint32_t *)preload_search_info(preload_kmdp,
1654 MODINFO_METADATA | MODINFOMD_SMAP_XATTR);
1655 count = *((uint32_t *)smapbase - 1) / sizeof(*smapbase);
1656 error = 0;
1657 for (i = 0; i < count; i++) {
1658 smap.base = smapbase[i].base;
1659 smap.length = smapbase[i].length;
1660 smap.type = smapbase[i].type;
1661 if (smapattr != NULL)
1662 smap.xattr = smapattr[i];
1663 else
1664 smap.xattr = 0;
1665 error = SYSCTL_OUT(req, &smap, sizeof(smap));
1666 }
1667 return (error);
1668 }
1669 SYSCTL_PROC(_machdep, OID_AUTO, smap,
1670 CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
1671 smap_sysctl_handler, "S,bios_smap_xattr",
1672 "Raw BIOS SMAP data");
1673
1674 static int
1675 efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS)
1676 {
1677 struct efi_map_header *efihdr;
1678 uint32_t efisize;
1679
1680 efihdr = (struct efi_map_header *)preload_search_info(preload_kmdp,
1681 MODINFO_METADATA | MODINFOMD_EFI_MAP);
1682 if (efihdr == NULL)
1683 return (0);
1684 efisize = *((uint32_t *)efihdr - 1);
1685 return (SYSCTL_OUT(req, efihdr, efisize));
1686 }
1687 SYSCTL_PROC(_machdep, OID_AUTO, efi_map,
1688 CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
1689 efi_map_sysctl_handler, "S,efi_map_header",
1690 "Raw EFI Memory Map");
1691
1692 static int
1693 efi_arch_sysctl_handler(SYSCTL_HANDLER_ARGS)
1694 {
1695 char *arch;
1696
1697 arch = (char *)preload_search_info(preload_kmdp,
1698 MODINFO_METADATA | MODINFOMD_EFI_ARCH);
1699 if (arch == NULL)
1700 return (0);
1701
1702 return (SYSCTL_OUT_STR(req, arch));
1703 }
1704 SYSCTL_PROC(_machdep, OID_AUTO, efi_arch,
1705 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
1706 efi_arch_sysctl_handler, "A", "EFI Firmware Architecture");
1707
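/*
 * Spinlock entry/exit for MD code: the first spinlock a thread acquires
 * disables interrupts and records the previous flags; the per-thread count
 * lets spinlock sections nest, and the saved flags are only restored when
 * the outermost section exits.
 */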
1708 void
1709 spinlock_enter(void)
1710 {
1711 struct thread *td;
1712 register_t flags;
1713
1714 td = curthread;
1715 if (td->td_md.md_spinlock_count == 0) {
1716 flags = intr_disable();
1717 td->td_md.md_spinlock_count = 1;
1718 td->td_md.md_saved_flags = flags;
1719 critical_enter();
1720 } else
1721 td->td_md.md_spinlock_count++;
1722 }
1723
1724 void
1725 spinlock_exit(void)
1726 {
1727 struct thread *td;
1728 register_t flags;
1729
1730 td = curthread;
1731 flags = td->td_md.md_saved_flags;
1732 td->td_md.md_spinlock_count--;
1733 if (td->td_md.md_spinlock_count == 0) {
1734 critical_exit();
1735 intr_restore(flags);
1736 }
1737 }
1738
1739 /*
1740 * Construct a PCB from a trapframe. This is called from kdb_trap() where
1741 * we want to start a backtrace from the function that caused us to enter
1742 * the debugger. We have the context in the trapframe, but base the trace
1743 * on the PCB. The PCB doesn't have to be perfect, as long as it contains
1744 * enough for a backtrace.
1745 */
1746 void
1747 makectx(struct trapframe *tf, struct pcb *pcb)
1748 {
1749
1750 pcb->pcb_r12 = tf->tf_r12;
1751 pcb->pcb_r13 = tf->tf_r13;
1752 pcb->pcb_r14 = tf->tf_r14;
1753 pcb->pcb_r15 = tf->tf_r15;
1754 pcb->pcb_rbp = tf->tf_rbp;
1755 pcb->pcb_rbx = tf->tf_rbx;
1756 pcb->pcb_rip = tf->tf_rip;
1757 pcb->pcb_rsp = tf->tf_rsp;
1758 }
1759
1760 /*
1761 * pcb_flags is only modified by the current thread, or by other threads
1762 * when the current thread is stopped. However, the current thread may change
1763 * it from the interrupt context in cpu_switch(), or in the trap handler.
1764 * When we read-modify-write pcb_flags from C sources, the compiler may generate
1765 * code that is not atomic regarding the interrupt handler. If a trap or
1766 * interrupt happens and any flag is modified from the handler, it can be
1767 * clobbered with the cached value later. Therefore, we implement setting
1768 * and clearing flags with single-instruction functions, which do not race
1769 * with possible modification of the flags from the trap or interrupt context,
1770 * because traps and interrupts are executed only on instruction boundary.
1771 */
1772 void
1773 set_pcb_flags_raw(struct pcb *pcb, const u_int flags)
1774 {
1775
1776 __asm __volatile("orl %1,%0"
1777 : "=m" (pcb->pcb_flags) : "ir" (flags), "m" (pcb->pcb_flags)
1778 : "cc", "memory");
1779
1780 }
1781
1782 /*
1783 * The support for RDFSBASE, WRFSBASE and similar instructions for %gs
1784 * base requires that the kernel saves MSR_FSBASE and MSR_{K,}GSBASE into
1785 * the pcb if user space modified the bases. We must save them on a context
1786 * switch or if the return to usermode happens through doreti.
1787 *
1788 * Tracking of both events is performed by the pcb flag PCB_FULL_IRET,
1789 * which has the consequence that the base MSRs must be saved each time
1790 * the PCB_FULL_IRET flag is set. We disable interrupts to sync with
1791 * context switches.
1792 */
1793 static void
1794 set_pcb_flags_fsgsbase(struct pcb *pcb, const u_int flags)
1795 {
1796 register_t r;
1797
1798 if (curpcb == pcb &&
1799 (flags & PCB_FULL_IRET) != 0 &&
1800 (pcb->pcb_flags & PCB_FULL_IRET) == 0) {
1801 r = intr_disable();
1802 if ((pcb->pcb_flags & PCB_FULL_IRET) == 0) {
1803 pcb->pcb_fsbase = rdfsbase();
1804 pcb->pcb_gsbase = rdmsr(MSR_KGSBASE);
1805 }
1806 set_pcb_flags_raw(pcb, flags);
1807 intr_restore(r);
1808 } else {
1809 set_pcb_flags_raw(pcb, flags);
1810 }
1811 }
1812
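/*
 * Resolved once by the ifunc machinery: CPUs advertising the FSGSBASE
 * extension get the variant that snapshots the user %fs/%gs bases when
 * PCB_FULL_IRET is first set; others use the plain flag update.
 */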
1813 DEFINE_IFUNC(, void, set_pcb_flags, (struct pcb *, const u_int))
1814 {
1815
1816 return ((cpu_stdext_feature & CPUID_STDEXT_FSGSBASE) != 0 ?
1817 set_pcb_flags_fsgsbase : set_pcb_flags_raw);
1818 }
1819
1820 void
1821 clear_pcb_flags(struct pcb *pcb, const u_int flags)
1822 {
1823
1824 __asm __volatile("andl %1,%0"
1825 : "=m" (pcb->pcb_flags) : "ir" (~flags), "m" (pcb->pcb_flags)
1826 : "cc", "memory");
1827 }
1828
1829 #ifdef KDB
1830
1831 /*
1832 * Provide inb() and outb() as functions. They are normally only available as
1833 * inline functions, thus cannot be called from the debugger.
1834 */
1835
1836 /* silence compiler warnings */
1837 u_char inb_(u_short);
1838 void outb_(u_short, u_char);
1839
1840 u_char
1841 inb_(u_short port)
1842 {
1843 return inb(port);
1844 }
1845
1846 void
1847 outb_(u_short port, u_char data)
1848 {
1849 outb(port, data);
1850 }
1851
1852 #endif /* KDB */
1853
1854 #undef memset
1855 #undef memmove
1856 #undef memcpy
1857
1858 void *memset_std(void *buf, int c, size_t len);
1859 void *memset_erms(void *buf, int c, size_t len);
1860 void *memmove_std(void * _Nonnull dst, const void * _Nonnull src,
1861 size_t len);
1862 void *memmove_erms(void * _Nonnull dst, const void * _Nonnull src,
1863 size_t len);
1864 void *memcpy_std(void * _Nonnull dst, const void * _Nonnull src,
1865 size_t len);
1866 void *memcpy_erms(void * _Nonnull dst, const void * _Nonnull src,
1867 size_t len);
1868
1869 #ifdef KCSAN
1870 /*
1871 * These fail to build as ifuncs when used with KCSAN.
1872 */
1873 void *
1874 memset(void *buf, int c, size_t len)
1875 {
1876
1877 return (memset_std(buf, c, len));
1878 }
1879
1880 void *
1881 memmove(void * _Nonnull dst, const void * _Nonnull src, size_t len)
1882 {
1883
1884 return (memmove_std(dst, src, len));
1885 }
1886
1887 void *
1888 memcpy(void * _Nonnull dst, const void * _Nonnull src, size_t len)
1889 {
1890
1891 return (memcpy_std(dst, src, len));
1892 }
1893 #else
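/*
 * Select the memory primitives once at ifunc resolution time: when CPUID
 * advertises ERMS (enhanced rep movsb/stosb), the *_erms variants are
 * used, otherwise the standard implementations.
 */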
1894 DEFINE_IFUNC(, void *, memset, (void *, int, size_t))
1895 {
1896
1897 return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
1898 memset_erms : memset_std);
1899 }
1900
1901 DEFINE_IFUNC(, void *, memmove, (void * _Nonnull, const void * _Nonnull,
1902 size_t))
1903 {
1904
1905 return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
1906 memmove_erms : memmove_std);
1907 }
1908
1909 DEFINE_IFUNC(, void *, memcpy, (void * _Nonnull, const void * _Nonnull,size_t))
1910 {
1911
1912 return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
1913 memcpy_erms : memcpy_std);
1914 }
1915 #endif
1916
1917 void pagezero_std(void *addr);
1918 void pagezero_erms(void *addr);
1919 DEFINE_IFUNC(, void , pagezero, (void *))
1920 {
1921
1922 return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
1923 pagezero_erms : pagezero_std);
1924 }
1925