1 /*-
2 * SPDX-License-Identifier: BSD-4-Clause
3 *
4 * Copyright (c) 2003 Peter Wemm.
5 * Copyright (c) 1992 Terrence R. Lambert.
6 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
7 * All rights reserved.
8 *
9 * This code is derived from software contributed to Berkeley by
10 * William Jolitz.
11 *
12 * Redistribution and use in source and binary forms, with or without
13 * modification, are permitted provided that the following conditions
14 * are met:
15 * 1. Redistributions of source code must retain the above copyright
16 * notice, this list of conditions and the following disclaimer.
17 * 2. Redistributions in binary form must reproduce the above copyright
18 * notice, this list of conditions and the following disclaimer in the
19 * documentation and/or other materials provided with the distribution.
20 * 3. All advertising materials mentioning features or use of this software
21 * must display the following acknowledgement:
22 * This product includes software developed by the University of
23 * California, Berkeley and its contributors.
24 * 4. Neither the name of the University nor the names of its contributors
25 * may be used to endorse or promote products derived from this software
26 * without specific prior written permission.
27 *
28 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
29 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
32 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
33 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
34 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
35 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
37 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
38 * SUCH DAMAGE.
39 */
40
41 #include <sys/cdefs.h>
42 #include "opt_atpic.h"
43 #include "opt_cpu.h"
44 #include "opt_ddb.h"
45 #include "opt_inet.h"
46 #include "opt_isa.h"
47 #include "opt_kstack_pages.h"
48 #include "opt_maxmem.h"
49 #include "opt_pci.h"
50 #include "opt_platform.h"
51 #include "opt_sched.h"
52
53 #include <sys/param.h>
54 #include <sys/proc.h>
55 #include <sys/systm.h>
56 #include <sys/asan.h>
57 #include <sys/bio.h>
58 #include <sys/buf.h>
59 #include <sys/bus.h>
60 #include <sys/callout.h>
61 #include <sys/cons.h>
62 #include <sys/cpu.h>
63 #include <sys/csan.h>
64 #include <sys/efi.h>
65 #include <sys/eventhandler.h>
66 #include <sys/exec.h>
67 #include <sys/imgact.h>
68 #include <sys/kdb.h>
69 #include <sys/kernel.h>
70 #include <sys/ktr.h>
71 #include <sys/linker.h>
72 #include <sys/lock.h>
73 #include <sys/malloc.h>
74 #include <sys/memrange.h>
75 #include <sys/msan.h>
76 #include <sys/msgbuf.h>
77 #include <sys/mutex.h>
78 #include <sys/pcpu.h>
79 #include <sys/ptrace.h>
80 #include <sys/reboot.h>
81 #include <sys/reg.h>
82 #include <sys/rwlock.h>
83 #include <sys/sched.h>
84 #include <sys/signalvar.h>
85 #ifdef SMP
86 #include <sys/smp.h>
87 #endif
88 #include <sys/syscallsubr.h>
89 #include <sys/sysctl.h>
90 #include <sys/sysent.h>
91 #include <sys/sysproto.h>
92 #include <sys/ucontext.h>
93 #include <sys/vmmeter.h>
94
95 #include <vm/vm.h>
96 #include <vm/vm_param.h>
97 #include <vm/vm_extern.h>
98 #include <vm/vm_kern.h>
99 #include <vm/vm_page.h>
100 #include <vm/vm_map.h>
101 #include <vm/vm_object.h>
102 #include <vm/vm_pager.h>
103 #include <vm/vm_phys.h>
104 #include <vm/vm_dumpset.h>
105
106 #ifdef DDB
107 #ifndef KDB
108 #error KDB must be enabled in order for DDB to work!
109 #endif
110 #include <ddb/ddb.h>
111 #include <ddb/db_sym.h>
112 #endif
113
114 #include <net/netisr.h>
115
116 #include <dev/smbios/smbios.h>
117
118 #include <machine/clock.h>
119 #include <machine/cpu.h>
120 #include <machine/cputypes.h>
121 #include <machine/frame.h>
122 #include <machine/intr_machdep.h>
123 #include <x86/mca.h>
124 #include <machine/md_var.h>
125 #include <machine/metadata.h>
126 #include <machine/pc/bios.h>
127 #include <machine/pcb.h>
128 #include <machine/proc.h>
129 #include <machine/sigframe.h>
130 #include <machine/specialreg.h>
131 #include <machine/trap.h>
132 #include <machine/tss.h>
133 #include <x86/ucode.h>
134 #include <x86/ifunc.h>
135 #ifdef SMP
136 #include <machine/smp.h>
137 #endif
138 #ifdef FDT
139 #include <x86/fdt.h>
140 #endif
141
142 #ifdef DEV_ATPIC
143 #include <x86/isa/icu.h>
144 #else
145 #include <x86/apicvar.h>
146 #endif
147
148 #include <isa/isareg.h>
149 #include <isa/rtc.h>
150 #include <x86/init.h>
151
152 /* Sanity check for __curthread() */
153 CTASSERT(offsetof(struct pcpu, pc_curthread) == 0);
154
155 /*
156 * The PTI trampoline stack needs enough space for a hardware trapframe and a
157 * couple of scratch registers, as well as the trapframe left behind after an
158 * iret fault.
159 */
160 CTASSERT(PC_PTI_STACK_SZ * sizeof(register_t) >= 2 * sizeof(struct pti_frame) -
161 offsetof(struct pti_frame, pti_rip));
162
163 extern u_int64_t hammer_time(u_int64_t, u_int64_t);
164
165 static void cpu_startup(void *);
166 SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL);
167
168 /* Probe 8254 PIT and TSC. */
169 static void native_clock_source_init(void);
170
171 /* Preload data parse function */
172 static caddr_t native_parse_preload_data(u_int64_t);
173
174 /* Native function to fetch and parse the e820 map */
175 static void native_parse_memmap(caddr_t, vm_paddr_t *, int *);
176
177 /* Default init_ops implementation. */
178 struct init_ops init_ops = {
179 .parse_preload_data = native_parse_preload_data,
180 .early_clock_source_init = native_clock_source_init,
181 .early_delay = i8254_delay,
182 .parse_memmap = native_parse_memmap,
183 };
184
185 /*
186 * Physical address of the EFI System Table. Stashed from the metadata hints
187 * passed into the kernel and used by the EFI code to call runtime services.
188 */
189 vm_paddr_t efi_systbl_phys;
190
191 /* Intel ICH registers */
192 #define ICH_PMBASE 0x400
#define ICH_SMI_EN (ICH_PMBASE + 0x30)
194
195 int _udatasel, _ucodesel, _ucode32sel, _ufssel, _ugssel;
196
197 int cold = 1;
198
199 long Maxmem = 0;
200 long realmem = 0;
201 int late_console = 1;
202
203 struct kva_md_info kmi;
204
205 struct region_descriptor r_idt;
206
207 struct pcpu *__pcpu;
208 struct pcpu temp_bsp_pcpu;
209
210 struct mtx icu_lock;
211
212 struct mem_range_softc mem_range_softc;
213
214 struct mtx dt_lock; /* lock for GDT and LDT */
215
216 void (*vmm_resume_p)(void);
217
218 bool efi_boot;
219
220 static void
cpu_startup(void *dummy)
222 {
223 uintmax_t memsize;
224 char *sysenv;
225
/*
 * On MacBooks, we need to prevent the legacy USB circuit from
 * generating an SMI#, because this can cause several problems,
 * namely: incorrect CPU frequency detection and failure to
 * start the APs.
 * We do this by clearing a bit in the SMI_EN (SMI Control and
 * Enable) register of the Intel ICH LPC Interface Bridge.
 */
234 sysenv = kern_getenv("smbios.system.product");
235 if (sysenv != NULL) {
236 if (strncmp(sysenv, "MacBook1,1", 10) == 0 ||
237 strncmp(sysenv, "MacBook3,1", 10) == 0 ||
238 strncmp(sysenv, "MacBook4,1", 10) == 0 ||
239 strncmp(sysenv, "MacBookPro1,1", 13) == 0 ||
240 strncmp(sysenv, "MacBookPro1,2", 13) == 0 ||
241 strncmp(sysenv, "MacBookPro3,1", 13) == 0 ||
242 strncmp(sysenv, "MacBookPro4,1", 13) == 0 ||
243 strncmp(sysenv, "Macmini1,1", 10) == 0) {
244 if (bootverbose)
245 printf("Disabling LEGACY_USB_EN bit on "
246 "Intel ICH.\n");
247 outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8);
248 }
249 freeenv(sysenv);
250 }
251
252 /*
253 * Good {morning,afternoon,evening,night}.
254 */
255 startrtclock();
256 printcpuinfo();
257
/*
 * Display physical memory if SMBIOS reports a reasonable amount.
 */
261 memsize = 0;
262 sysenv = kern_getenv("smbios.memory.enabled");
263 if (sysenv != NULL) {
264 memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10;
265 freeenv(sysenv);
266 }
267 if (memsize < ptoa((uintmax_t)vm_free_count()))
268 memsize = ptoa((uintmax_t)Maxmem);
269 printf("real memory = %ju (%ju MB)\n", memsize, memsize >> 20);
270 realmem = atop(memsize);
271
272 /*
273 * Display any holes after the first chunk of extended memory.
274 */
275 if (bootverbose) {
276 int indx;
277
278 printf("Physical memory chunk(s):\n");
279 for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
280 vm_paddr_t size;
281
282 size = phys_avail[indx + 1] - phys_avail[indx];
283 printf(
284 "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n",
285 (uintmax_t)phys_avail[indx],
286 (uintmax_t)phys_avail[indx + 1] - 1,
287 (uintmax_t)size, (uintmax_t)size / PAGE_SIZE);
288 }
289 }
290
291 vm_ksubmap_init(&kmi);
292
293 printf("avail memory = %ju (%ju MB)\n",
294 ptoa((uintmax_t)vm_free_count()),
295 ptoa((uintmax_t)vm_free_count()) / 1048576);
296 #ifdef DEV_PCI
297 if (bootverbose && intel_graphics_stolen_base != 0)
298 printf("intel stolen mem: base %#jx size %ju MB\n",
299 (uintmax_t)intel_graphics_stolen_base,
300 (uintmax_t)intel_graphics_stolen_size / 1024 / 1024);
301 #endif
302
303 /*
304 * Set up buffers, so they can be used to read disk labels.
305 */
306 bufinit();
307 vm_pager_bufferinit();
308
309 cpu_setregs();
310 }
311
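/* Resolve kernel ifunc relocations that are deferred until after cpu_startup(). */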
312 static void
late_ifunc_resolve(void *dummy __unused)
314 {
315 link_elf_late_ireloc();
316 }
317 SYSINIT(late_ifunc_resolve, SI_SUB_CPU, SI_ORDER_ANY, late_ifunc_resolve, NULL);
318
319
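/*
 * Load the initial CR0 state on the current CPU: native FPU error
 * reporting (NE), lazy FPU context switching (MP/TS), kernel-mode
 * write protection (WP) and alignment check support (AM).
 */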
320 void
cpu_setregs(void)
322 {
323 register_t cr0;
324
325 TSENTER();
326 cr0 = rcr0();
327 cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM;
328 TSENTER2("load_cr0");
329 load_cr0(cr0);
330 TSEXIT2("load_cr0");
331 TSEXIT();
332 }
333
334 /*
335 * Initialize amd64 and configure to run kernel
336 */
337
338 /*
339 * Initialize segments & interrupt table
340 */
341 static struct gate_descriptor idt0[NIDT];
342 struct gate_descriptor *idt = &idt0[0]; /* interrupt descriptor table */
343
344 static char dblfault_stack[DBLFAULT_STACK_SIZE] __aligned(16);
345 static char mce0_stack[MCE_STACK_SIZE] __aligned(16);
346 static char nmi0_stack[NMI_STACK_SIZE] __aligned(16);
347 static char dbg0_stack[DBG_STACK_SIZE] __aligned(16);
348 CTASSERT(sizeof(struct nmi_pcpu) == 16);
349
350 /*
351 * Software prototypes -- in more palatable form.
352 *
353 * Keep GUFS32, GUGS32, GUCODE32 and GUDATA at the same
354 * slots as corresponding segments for i386 kernel.
355 */
356 struct soft_segment_descriptor gdt_segs[] = {
357 [GNULL_SEL] = { /* 0 Null Descriptor */
358 .ssd_base = 0x0,
359 .ssd_limit = 0x0,
360 .ssd_type = 0,
361 .ssd_dpl = 0,
362 .ssd_p = 0,
363 .ssd_long = 0,
364 .ssd_def32 = 0,
365 .ssd_gran = 0 },
366 [GNULL2_SEL] = { /* 1 Null Descriptor */
367 .ssd_base = 0x0,
368 .ssd_limit = 0x0,
369 .ssd_type = 0,
370 .ssd_dpl = 0,
371 .ssd_p = 0,
372 .ssd_long = 0,
373 .ssd_def32 = 0,
374 .ssd_gran = 0 },
375 [GUFS32_SEL] = { /* 2 32 bit %gs Descriptor for user */
376 .ssd_base = 0x0,
377 .ssd_limit = 0xfffff,
378 .ssd_type = SDT_MEMRWA,
379 .ssd_dpl = SEL_UPL,
380 .ssd_p = 1,
381 .ssd_long = 0,
382 .ssd_def32 = 1,
383 .ssd_gran = 1 },
384 [GUGS32_SEL] = { /* 3 32 bit %fs Descriptor for user */
385 .ssd_base = 0x0,
386 .ssd_limit = 0xfffff,
387 .ssd_type = SDT_MEMRWA,
388 .ssd_dpl = SEL_UPL,
389 .ssd_p = 1,
390 .ssd_long = 0,
391 .ssd_def32 = 1,
392 .ssd_gran = 1 },
393 [GCODE_SEL] = { /* 4 Code Descriptor for kernel */
394 .ssd_base = 0x0,
395 .ssd_limit = 0xfffff,
396 .ssd_type = SDT_MEMERA,
397 .ssd_dpl = SEL_KPL,
398 .ssd_p = 1,
399 .ssd_long = 1,
400 .ssd_def32 = 0,
401 .ssd_gran = 1 },
402 [GDATA_SEL] = { /* 5 Data Descriptor for kernel */
403 .ssd_base = 0x0,
404 .ssd_limit = 0xfffff,
405 .ssd_type = SDT_MEMRWA,
406 .ssd_dpl = SEL_KPL,
407 .ssd_p = 1,
408 .ssd_long = 1,
409 .ssd_def32 = 0,
410 .ssd_gran = 1 },
411 [GUCODE32_SEL] = { /* 6 32 bit Code Descriptor for user */
412 .ssd_base = 0x0,
413 .ssd_limit = 0xfffff,
414 .ssd_type = SDT_MEMERA,
415 .ssd_dpl = SEL_UPL,
416 .ssd_p = 1,
417 .ssd_long = 0,
418 .ssd_def32 = 1,
419 .ssd_gran = 1 },
420 [GUDATA_SEL] = { /* 7 32/64 bit Data Descriptor for user */
421 .ssd_base = 0x0,
422 .ssd_limit = 0xfffff,
423 .ssd_type = SDT_MEMRWA,
424 .ssd_dpl = SEL_UPL,
425 .ssd_p = 1,
426 .ssd_long = 0,
427 .ssd_def32 = 1,
428 .ssd_gran = 1 },
429 [GUCODE_SEL] = { /* 8 64 bit Code Descriptor for user */
430 .ssd_base = 0x0,
431 .ssd_limit = 0xfffff,
432 .ssd_type = SDT_MEMERA,
433 .ssd_dpl = SEL_UPL,
434 .ssd_p = 1,
435 .ssd_long = 1,
436 .ssd_def32 = 0,
437 .ssd_gran = 1 },
438 [GPROC0_SEL] = { /* 9 Proc 0 TSS Descriptor */
439 .ssd_base = 0x0,
440 .ssd_limit = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE - 1,
441 .ssd_type = SDT_SYSTSS,
442 .ssd_dpl = SEL_KPL,
443 .ssd_p = 1,
444 .ssd_long = 0,
445 .ssd_def32 = 0,
446 .ssd_gran = 0 },
447 [GPROC0_SEL + 1] = { /* 10 Proc 0 TSS descriptor, double size */
448 .ssd_base = 0x0,
449 .ssd_limit = 0x0,
450 .ssd_type = 0,
451 .ssd_dpl = 0,
452 .ssd_p = 0,
453 .ssd_long = 0,
454 .ssd_def32 = 0,
455 .ssd_gran = 0 },
456 [GUSERLDT_SEL] = { /* 11 LDT Descriptor */
457 .ssd_base = 0x0,
458 .ssd_limit = 0x0,
459 .ssd_type = 0,
460 .ssd_dpl = 0,
461 .ssd_p = 0,
462 .ssd_long = 0,
463 .ssd_def32 = 0,
464 .ssd_gran = 0 },
465 [GUSERLDT_SEL + 1] = { /* 12 LDT Descriptor, double size */
466 .ssd_base = 0x0,
467 .ssd_limit = 0x0,
468 .ssd_type = 0,
469 .ssd_dpl = 0,
470 .ssd_p = 0,
471 .ssd_long = 0,
472 .ssd_def32 = 0,
473 .ssd_gran = 0 },
474 };
475 _Static_assert(nitems(gdt_segs) == NGDT, "Stale NGDT");
476
477 void
setidt(int idx, inthand_t *func, int typ, int dpl, int ist)
479 {
480 struct gate_descriptor *ip;
481
482 ip = idt + idx;
483 ip->gd_looffset = (uintptr_t)func;
484 ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
485 ip->gd_ist = ist;
486 ip->gd_xx = 0;
487 ip->gd_type = typ;
488 ip->gd_dpl = dpl;
489 ip->gd_p = 1;
490 ip->gd_hioffset = ((uintptr_t)func)>>16 ;
491 }
492
493 extern inthand_t
494 IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
495 IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
496 IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
497 IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
498 IDTVEC(xmm), IDTVEC(dblfault),
499 IDTVEC(div_pti), IDTVEC(bpt_pti),
500 IDTVEC(ofl_pti), IDTVEC(bnd_pti), IDTVEC(ill_pti), IDTVEC(dna_pti),
501 IDTVEC(fpusegm_pti), IDTVEC(tss_pti), IDTVEC(missing_pti),
502 IDTVEC(stk_pti), IDTVEC(prot_pti), IDTVEC(page_pti),
503 IDTVEC(rsvd_pti), IDTVEC(fpu_pti), IDTVEC(align_pti),
504 IDTVEC(xmm_pti),
505 #ifdef KDTRACE_HOOKS
506 IDTVEC(dtrace_ret), IDTVEC(dtrace_ret_pti),
507 #endif
508 #ifdef XENHVM
509 IDTVEC(xen_intr_upcall), IDTVEC(xen_intr_upcall_pti),
510 #endif
511 IDTVEC(fast_syscall), IDTVEC(fast_syscall32),
512 IDTVEC(fast_syscall_pti);
513
514 #ifdef DDB
515 /*
516 * Display the index and function name of any IDT entries that don't use
517 * the default 'rsvd' entry point.
518 */
DB_SHOW_COMMAND_FLAGS(idt, db_show_idt, DB_CMD_MEMSAFE)
520 {
521 struct gate_descriptor *ip;
522 int idx;
523 uintptr_t func;
524
525 ip = idt;
526 for (idx = 0; idx < NIDT && !db_pager_quit; idx++) {
527 func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset);
528 if (func != (uintptr_t)&IDTVEC(rsvd)) {
529 db_printf("%3d\t", idx);
530 db_printsym(func, DB_STGY_PROC);
531 db_printf("\n");
532 }
533 ip++;
534 }
535 }
536
537 /* Show privileged registers. */
DB_SHOW_COMMAND_FLAGS(sysregs, db_show_sysregs, DB_CMD_MEMSAFE)
539 {
540 struct {
541 uint16_t limit;
542 uint64_t base;
543 } __packed idtr, gdtr;
544 uint16_t ldt, tr;
545
546 __asm __volatile("sidt %0" : "=m" (idtr));
547 db_printf("idtr\t0x%016lx/%04x\n",
548 (u_long)idtr.base, (u_int)idtr.limit);
549 __asm __volatile("sgdt %0" : "=m" (gdtr));
550 db_printf("gdtr\t0x%016lx/%04x\n",
551 (u_long)gdtr.base, (u_int)gdtr.limit);
552 __asm __volatile("sldt %0" : "=r" (ldt));
553 db_printf("ldtr\t0x%04x\n", ldt);
554 __asm __volatile("str %0" : "=r" (tr));
555 db_printf("tr\t0x%04x\n", tr);
556 db_printf("cr0\t0x%016lx\n", rcr0());
557 db_printf("cr2\t0x%016lx\n", rcr2());
558 db_printf("cr3\t0x%016lx\n", rcr3());
559 db_printf("cr4\t0x%016lx\n", rcr4());
560 if (rcr4() & CR4_XSAVE)
561 db_printf("xcr0\t0x%016lx\n", rxcr(0));
562 db_printf("EFER\t0x%016lx\n", rdmsr(MSR_EFER));
563 if (cpu_feature2 & (CPUID2_VMX | CPUID2_SMX))
564 db_printf("FEATURES_CTL\t%016lx\n",
565 rdmsr(MSR_IA32_FEATURE_CONTROL));
566 db_printf("DEBUG_CTL\t0x%016lx\n", rdmsr(MSR_DEBUGCTLMSR));
567 db_printf("PAT\t0x%016lx\n", rdmsr(MSR_PAT));
568 db_printf("GSBASE\t0x%016lx\n", rdmsr(MSR_GSBASE));
569 }
570
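/* Show debug registers. */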
DB_SHOW_COMMAND_FLAGS(dbregs, db_show_dbregs, DB_CMD_MEMSAFE)
572 {
573
574 db_printf("dr0\t0x%016lx\n", rdr0());
575 db_printf("dr1\t0x%016lx\n", rdr1());
576 db_printf("dr2\t0x%016lx\n", rdr2());
577 db_printf("dr3\t0x%016lx\n", rdr3());
578 db_printf("dr6\t0x%016lx\n", rdr6());
579 db_printf("dr7\t0x%016lx\n", rdr7());
580 }
581 #endif
582
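/* Convert a hardware user segment descriptor into its software form. */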
583 void
sdtossd(struct user_segment_descriptor *sd, struct soft_segment_descriptor *ssd)
585 {
586
587 ssd->ssd_base = (sd->sd_hibase << 24) | sd->sd_lobase;
588 ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
589 ssd->ssd_type = sd->sd_type;
590 ssd->ssd_dpl = sd->sd_dpl;
591 ssd->ssd_p = sd->sd_p;
592 ssd->ssd_long = sd->sd_long;
593 ssd->ssd_def32 = sd->sd_def32;
594 ssd->ssd_gran = sd->sd_gran;
595 }
596
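/* Convert a software segment descriptor into a hardware user segment descriptor. */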
597 void
ssdtosd(struct soft_segment_descriptor *ssd, struct user_segment_descriptor *sd)
599 {
600
601 sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
602 sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff;
603 sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
604 sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
605 sd->sd_type = ssd->ssd_type;
606 sd->sd_dpl = ssd->ssd_dpl;
607 sd->sd_p = ssd->ssd_p;
608 sd->sd_long = ssd->ssd_long;
609 sd->sd_def32 = ssd->ssd_def32;
610 sd->sd_gran = ssd->ssd_gran;
611 }
612
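/* Convert a software segment descriptor into a hardware system segment descriptor (TSS, LDT). */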
613 void
ssdtosyssd(struct soft_segment_descriptor *ssd, struct system_segment_descriptor *sd)
615 {
616
617 sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
618 sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful;
619 sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
620 sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
621 sd->sd_type = ssd->ssd_type;
622 sd->sd_dpl = ssd->ssd_dpl;
623 sd->sd_p = ssd->ssd_p;
624 sd->sd_gran = ssd->ssd_gran;
625 }
626
627 u_int basemem;
628
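/*
 * Insert a [base, base + length) region into the physmap array of base/end
 * pairs, coalescing it with an adjacent region when possible.  Returns 0 if
 * the array would overflow, 1 otherwise (including when the region is
 * ignored).
 */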
629 static int
add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap,
631 int *physmap_idxp)
632 {
633 int i, insert_idx, physmap_idx;
634
635 physmap_idx = *physmap_idxp;
636
637 if (length == 0)
638 return (1);
639
640 /*
641 * Find insertion point while checking for overlap. Start off by
642 * assuming the new entry will be added to the end.
643 *
644 * NB: physmap_idx points to the next free slot.
645 */
646 insert_idx = physmap_idx;
647 for (i = 0; i <= physmap_idx; i += 2) {
648 if (base < physmap[i + 1]) {
649 if (base + length <= physmap[i]) {
650 insert_idx = i;
651 break;
652 }
653 if (boothowto & RB_VERBOSE)
654 printf(
655 "Overlapping memory regions, ignoring second region\n");
656 return (1);
657 }
658 }
659
660 /* See if we can prepend to the next entry. */
661 if (insert_idx <= physmap_idx && base + length == physmap[insert_idx]) {
662 physmap[insert_idx] = base;
663 return (1);
664 }
665
666 /* See if we can append to the previous entry. */
667 if (insert_idx > 0 && base == physmap[insert_idx - 1]) {
668 physmap[insert_idx - 1] += length;
669 return (1);
670 }
671
672 physmap_idx += 2;
673 *physmap_idxp = physmap_idx;
674 if (physmap_idx == PHYS_AVAIL_ENTRIES) {
675 printf(
676 "Too many segments in the physical address map, giving up\n");
677 return (0);
678 }
679
680 /*
681 * Move the last 'N' entries down to make room for the new
682 * entry if needed.
683 */
684 for (i = (physmap_idx - 2); i > insert_idx; i -= 2) {
685 physmap[i] = physmap[i - 2];
686 physmap[i + 1] = physmap[i - 1];
687 }
688
689 /* Insert the new entry. */
690 physmap[insert_idx] = base;
691 physmap[insert_idx + 1] = base + length;
692 return (1);
693 }
694
695 void
bios_add_smap_entries(struct bios_smap *smapbase, u_int32_t smapsize,
697 vm_paddr_t *physmap, int *physmap_idx)
698 {
699 struct bios_smap *smap, *smapend;
700
701 smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);
702
703 for (smap = smapbase; smap < smapend; smap++) {
704 if (boothowto & RB_VERBOSE)
705 printf("SMAP type=%02x base=%016lx len=%016lx\n",
706 smap->type, smap->base, smap->length);
707
708 if (smap->type != SMAP_TYPE_MEMORY)
709 continue;
710
711 if (!add_physmap_entry(smap->base, smap->length, physmap,
712 physmap_idx))
713 break;
714 }
715 }
716
717 static void
add_efi_map_entries(struct efi_map_header *efihdr, vm_paddr_t *physmap,
719 int *physmap_idx)
720 {
721 struct efi_md *map, *p;
722 const char *type;
723 size_t efisz;
724 int ndesc, i;
725
726 static const char *types[] = {
727 "Reserved",
728 "LoaderCode",
729 "LoaderData",
730 "BootServicesCode",
731 "BootServicesData",
732 "RuntimeServicesCode",
733 "RuntimeServicesData",
734 "ConventionalMemory",
735 "UnusableMemory",
736 "ACPIReclaimMemory",
737 "ACPIMemoryNVS",
738 "MemoryMappedIO",
739 "MemoryMappedIOPortSpace",
740 "PalCode",
741 "PersistentMemory"
742 };
743
744 /*
745 * Memory map data provided by UEFI via the GetMemoryMap
746 * Boot Services API.
747 */
748 efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf;
749 map = (struct efi_md *)((uint8_t *)efihdr + efisz);
750
751 if (efihdr->descriptor_size == 0)
752 return;
753 ndesc = efihdr->memory_size / efihdr->descriptor_size;
754
755 if (boothowto & RB_VERBOSE)
756 printf("%23s %12s %12s %8s %4s\n",
757 "Type", "Physical", "Virtual", "#Pages", "Attr");
758
759 for (i = 0, p = map; i < ndesc; i++,
760 p = efi_next_descriptor(p, efihdr->descriptor_size)) {
761 if (boothowto & RB_VERBOSE) {
762 if (p->md_type < nitems(types))
763 type = types[p->md_type];
764 else
765 type = "<INVALID>";
766 printf("%23s %012lx %012lx %08lx ", type, p->md_phys,
767 p->md_virt, p->md_pages);
768 if (p->md_attr & EFI_MD_ATTR_UC)
769 printf("UC ");
770 if (p->md_attr & EFI_MD_ATTR_WC)
771 printf("WC ");
772 if (p->md_attr & EFI_MD_ATTR_WT)
773 printf("WT ");
774 if (p->md_attr & EFI_MD_ATTR_WB)
775 printf("WB ");
776 if (p->md_attr & EFI_MD_ATTR_UCE)
777 printf("UCE ");
778 if (p->md_attr & EFI_MD_ATTR_WP)
779 printf("WP ");
780 if (p->md_attr & EFI_MD_ATTR_RP)
781 printf("RP ");
782 if (p->md_attr & EFI_MD_ATTR_XP)
783 printf("XP ");
784 if (p->md_attr & EFI_MD_ATTR_NV)
785 printf("NV ");
786 if (p->md_attr & EFI_MD_ATTR_MORE_RELIABLE)
787 printf("MORE_RELIABLE ");
788 if (p->md_attr & EFI_MD_ATTR_RO)
789 printf("RO ");
790 if (p->md_attr & EFI_MD_ATTR_RT)
791 printf("RUNTIME");
792 printf("\n");
793 }
794
795 switch (p->md_type) {
796 case EFI_MD_TYPE_CODE:
797 case EFI_MD_TYPE_DATA:
798 case EFI_MD_TYPE_BS_CODE:
799 case EFI_MD_TYPE_BS_DATA:
800 case EFI_MD_TYPE_FREE:
801 /*
802 * We're allowed to use any entry with these types.
803 */
804 break;
805 default:
806 continue;
807 }
808
809 if (!add_physmap_entry(p->md_phys, p->md_pages * EFI_PAGE_SIZE,
810 physmap, physmap_idx))
811 break;
812 }
813 }
814
815 static void
native_parse_memmap(caddr_t kmdp, vm_paddr_t *physmap, int *physmap_idx)
817 {
818 struct bios_smap *smap;
819 struct efi_map_header *efihdr;
820 u_int32_t size;
821
822 /*
823 * Memory map from INT 15:E820.
824 *
825 * subr_module.c says:
826 * "Consumer may safely assume that size value precedes data."
827 * ie: an int32_t immediately precedes smap.
828 */
829
830 efihdr = (struct efi_map_header *)preload_search_info(kmdp,
831 MODINFO_METADATA | MODINFOMD_EFI_MAP);
832 smap = (struct bios_smap *)preload_search_info(kmdp,
833 MODINFO_METADATA | MODINFOMD_SMAP);
834 if (efihdr == NULL && smap == NULL)
835 panic("No BIOS smap or EFI map info from loader!");
836
837 if (efihdr != NULL) {
838 add_efi_map_entries(efihdr, physmap, physmap_idx);
839 strlcpy(bootmethod, "UEFI", sizeof(bootmethod));
840 } else {
841 size = *((u_int32_t *)smap - 1);
842 bios_add_smap_entries(smap, size, physmap, physmap_idx);
843 strlcpy(bootmethod, "BIOS", sizeof(bootmethod));
844 }
845 }
846
847 #define PAGES_PER_GB (1024 * 1024 * 1024 / PAGE_SIZE)
848
849 /*
850 * Populate the (physmap) array with base/bound pairs describing the
851 * available physical memory in the system, then test this memory and
852 * build the phys_avail array describing the actually-available memory.
853 *
854 * Total memory size may be set by the kernel environment variable
855 * hw.physmem or the compile-time define MAXMEM.
856 *
857 * XXX first should be vm_paddr_t.
858 */
859 static void
getmemsize(caddr_t kmdp, u_int64_t first)
861 {
862 int i, physmap_idx, pa_indx, da_indx;
863 vm_paddr_t pa, physmap[PHYS_AVAIL_ENTRIES];
864 u_long physmem_start, physmem_tunable, memtest;
865 pt_entry_t *pte;
866 quad_t dcons_addr, dcons_size;
867 int page_counter;
868
869 TSENTER();
870 /*
871 * Tell the physical memory allocator about pages used to store
872 * the kernel and preloaded data. See kmem_bootstrap_free().
873 */
874 vm_phys_early_add_seg((vm_paddr_t)kernphys, trunc_page(first));
875
876 bzero(physmap, sizeof(physmap));
877 physmap_idx = 0;
878
879 init_ops.parse_memmap(kmdp, physmap, &physmap_idx);
880 physmap_idx -= 2;
881
882 /*
883 * Find the 'base memory' segment for SMP
884 */
885 basemem = 0;
886 for (i = 0; i <= physmap_idx; i += 2) {
887 if (physmap[i] <= 0xA0000) {
888 basemem = physmap[i + 1] / 1024;
889 break;
890 }
891 }
892 if (basemem == 0 || basemem > 640) {
893 if (bootverbose)
894 printf(
895 "Memory map doesn't contain a basemem segment, faking it");
896 basemem = 640;
897 }
898
899 /*
900 * Maxmem isn't the "maximum memory", it's one larger than the
901 * highest page of the physical address space. It should be
902 * called something like "Maxphyspage". We may adjust this
903 * based on ``hw.physmem'' and the results of the memory test.
904 */
905 Maxmem = atop(physmap[physmap_idx + 1]);
906
907 #ifdef MAXMEM
908 Maxmem = MAXMEM / 4;
909 #endif
910
911 if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
912 Maxmem = atop(physmem_tunable);
913
914 /*
915 * The boot memory test is disabled by default, as it takes a
916 * significant amount of time on large-memory systems, and is
917 * unfriendly to virtual machines as it unnecessarily touches all
918 * pages.
919 *
920 * A general name is used as the code may be extended to support
921 * additional tests beyond the current "page present" test.
922 */
923 memtest = 0;
924 TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest);
925
926 /*
927 * Don't allow MAXMEM or hw.physmem to extend the amount of memory
928 * in the system.
929 */
930 if (Maxmem > atop(physmap[physmap_idx + 1]))
931 Maxmem = atop(physmap[physmap_idx + 1]);
932
933 if (atop(physmap[physmap_idx + 1]) != Maxmem &&
934 (boothowto & RB_VERBOSE))
935 printf("Physical memory use set to %ldK\n", Maxmem * 4);
936
937 /* call pmap initialization to make new kernel address space */
938 pmap_bootstrap(&first);
939
940 /*
941 * Size up each available chunk of physical memory.
942 *
943 * XXX Some BIOSes corrupt low 64KB between suspend and resume.
944 * By default, mask off the first 16 pages unless we appear to be
945 * running in a VM.
946 */
947 physmem_start = (vm_guest > VM_GUEST_NO ? 1 : 16) << PAGE_SHIFT;
948 TUNABLE_ULONG_FETCH("hw.physmem.start", &physmem_start);
949 if (physmap[0] < physmem_start) {
950 if (physmem_start < PAGE_SIZE)
951 physmap[0] = PAGE_SIZE;
952 else if (physmem_start >= physmap[1])
953 physmap[0] = round_page(physmap[1] - PAGE_SIZE);
954 else
955 physmap[0] = round_page(physmem_start);
956 }
957 pa_indx = 0;
958 da_indx = 1;
959 phys_avail[pa_indx++] = physmap[0];
960 phys_avail[pa_indx] = physmap[0];
961 dump_avail[da_indx] = physmap[0];
962 pte = CMAP1;
963
964 /*
965 * Get dcons buffer address
966 */
967 if (getenv_quad("dcons.addr", &dcons_addr) == 0 ||
968 getenv_quad("dcons.size", &dcons_size) == 0)
969 dcons_addr = 0;
970
971 /*
972 * physmap is in bytes, so when converting to page boundaries,
973 * round up the start address and round down the end address.
974 */
975 page_counter = 0;
976 if (memtest != 0)
977 printf("Testing system memory");
978 for (i = 0; i <= physmap_idx; i += 2) {
979 vm_paddr_t end;
980
981 end = ptoa((vm_paddr_t)Maxmem);
982 if (physmap[i + 1] < end)
983 end = trunc_page(physmap[i + 1]);
984 for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
985 int *ptr = (int *)CADDR1;
986 int tmp;
987 bool full, page_bad;
988
989 full = false;
990 /*
991 * block out kernel memory as not available.
992 */
993 if (pa >= (vm_paddr_t)kernphys && pa < first)
994 goto do_dump_avail;
995
996 /*
997 * block out dcons buffer
998 */
999 if (dcons_addr > 0
1000 && pa >= trunc_page(dcons_addr)
1001 && pa < dcons_addr + dcons_size)
1002 goto do_dump_avail;
1003
1004 page_bad = false;
1005 if (memtest == 0)
1006 goto skip_memtest;
1007
1008 /*
1009 * Print a "." every GB to show we're making
1010 * progress.
1011 */
1012 page_counter++;
1013 if ((page_counter % PAGES_PER_GB) == 0)
1014 printf(".");
1015
1016 /*
1017 * map page into kernel: valid, read/write,non-cacheable
1018 */
1019 *pte = pa | PG_V | PG_RW | PG_NC_PWT | PG_NC_PCD;
1020 invltlb();
1021
1022 tmp = *(int *)ptr;
1023 /*
1024 * Test for alternating 1's and 0's
1025 */
1026 *(volatile int *)ptr = 0xaaaaaaaa;
1027 if (*(volatile int *)ptr != 0xaaaaaaaa)
1028 page_bad = true;
1029 /*
1030 * Test for alternating 0's and 1's
1031 */
1032 *(volatile int *)ptr = 0x55555555;
1033 if (*(volatile int *)ptr != 0x55555555)
1034 page_bad = true;
1035 /*
1036 * Test for all 1's
1037 */
1038 *(volatile int *)ptr = 0xffffffff;
1039 if (*(volatile int *)ptr != 0xffffffff)
1040 page_bad = true;
1041 /*
1042 * Test for all 0's
1043 */
1044 *(volatile int *)ptr = 0x0;
1045 if (*(volatile int *)ptr != 0x0)
1046 page_bad = true;
1047 /*
1048 * Restore original value.
1049 */
1050 *(int *)ptr = tmp;
1051
1052 skip_memtest:
1053 /*
1054 * Adjust array of valid/good pages.
1055 */
1056 if (page_bad == true)
1057 continue;
/*
 * If this good page is a continuation of the
 * previous set of good pages, then just increase
 * the end pointer.  Otherwise start a new chunk.
 * Note that "end" points one page past the last
 * valid page, making the range >= start and < end.
 * If we're also doing a speculative memory test
 * and we are at or past the end, bump up Maxmem
 * so that we keep going.  The first bad page
 * will terminate the loop.
 */
1069 if (phys_avail[pa_indx] == pa) {
1070 phys_avail[pa_indx] += PAGE_SIZE;
1071 } else {
1072 pa_indx++;
1073 if (pa_indx == PHYS_AVAIL_ENTRIES) {
1074 printf(
1075 "Too many holes in the physical address space, giving up\n");
1076 pa_indx--;
1077 full = true;
1078 goto do_dump_avail;
1079 }
1080 phys_avail[pa_indx++] = pa; /* start */
1081 phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */
1082 }
1083 physmem++;
1084 do_dump_avail:
1085 if (dump_avail[da_indx] == pa) {
1086 dump_avail[da_indx] += PAGE_SIZE;
1087 } else {
1088 da_indx++;
1089 if (da_indx == PHYS_AVAIL_ENTRIES) {
1090 da_indx--;
1091 goto do_next;
1092 }
1093 dump_avail[da_indx++] = pa; /* start */
1094 dump_avail[da_indx] = pa + PAGE_SIZE; /* end */
1095 }
1096 do_next:
1097 if (full)
1098 break;
1099 }
1100 }
1101 *pte = 0;
1102 invltlb();
1103 if (memtest != 0)
1104 printf("\n");
1105
1106 /*
1107 * XXX
1108 * The last chunk must contain at least one page plus the message
1109 * buffer to avoid complicating other code (message buffer address
1110 * calculation, etc.).
1111 */
1112 while (phys_avail[pa_indx - 1] + PAGE_SIZE +
1113 round_page(msgbufsize) >= phys_avail[pa_indx]) {
1114 physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
1115 phys_avail[pa_indx--] = 0;
1116 phys_avail[pa_indx--] = 0;
1117 }
1118
1119 Maxmem = atop(phys_avail[pa_indx]);
1120
1121 /* Trim off space for the message buffer. */
1122 phys_avail[pa_indx] -= round_page(msgbufsize);
1123
1124 /* Map the message buffer. */
1125 msgbufp = (struct msgbuf *)PHYS_TO_DMAP(phys_avail[pa_indx]);
1126 TSEXIT();
1127 }
1128
1129 static caddr_t
native_parse_preload_data(u_int64_t modulep)
1131 {
1132 caddr_t kmdp;
1133 char *envp;
1134 #ifdef DDB
1135 vm_offset_t ksym_start;
1136 vm_offset_t ksym_end;
1137 #endif
1138
1139 preload_metadata = (caddr_t)(uintptr_t)(modulep + KERNBASE);
1140 preload_bootstrap_relocate(KERNBASE);
1141 kmdp = preload_search_by_type("elf kernel");
1142 if (kmdp == NULL)
1143 kmdp = preload_search_by_type("elf64 kernel");
1144 boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int);
1145 envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *);
1146 if (envp != NULL)
1147 envp += KERNBASE;
1148 init_static_kenv(envp, 0);
1149 #ifdef DDB
1150 ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t);
1151 ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t);
1152 db_fetch_ksymtab(ksym_start, ksym_end, 0);
1153 #endif
1154 efi_systbl_phys = MD_FETCH(kmdp, MODINFOMD_FW_HANDLE, vm_paddr_t);
1155
1156 return (kmdp);
1157 }
1158
1159 static void
native_clock_source_init(void)
1161 {
1162 i8254_init();
1163 }
1164
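/* Initialize the kernel debugger and enter it at boot if requested by the boot flags. */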
1165 static void
amd64_kdb_init(void)
1167 {
1168 kdb_init();
1169 #ifdef KDB
1170 if (boothowto & RB_KDB)
1171 kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger");
1172 #endif
1173 }
1174
1175 /* Set up the fast syscall stuff */
1176 void
amd64_conf_fast_syscall(void)
1178 {
1179 uint64_t msr;
1180
1181 msr = rdmsr(MSR_EFER) | EFER_SCE;
1182 wrmsr(MSR_EFER, msr);
1183 wrmsr(MSR_LSTAR, pti ? (u_int64_t)IDTVEC(fast_syscall_pti) :
1184 (u_int64_t)IDTVEC(fast_syscall));
1185 wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
1186 msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
1187 ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
1188 wrmsr(MSR_STAR, msr);
1189 wrmsr(MSR_SF_MASK, PSL_NT | PSL_T | PSL_I | PSL_C | PSL_D | PSL_AC);
1190 }
1191
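/*
 * First-stage per-CPU data initialization for the BSP: hook up curthread,
 * the TSS, LDT and 32-bit %fs/%gs descriptor pointers in the pcpu area.
 */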
1192 void
amd64_bsp_pcpu_init1(struct pcpu *pc)
1194 {
1195 struct user_segment_descriptor *gdt;
1196
1197 PCPU_SET(prvspace, pc);
1198 gdt = *PCPU_PTR(gdt);
1199 PCPU_SET(curthread, &thread0);
1200 PCPU_SET(tssp, PCPU_PTR(common_tss));
1201 PCPU_SET(tss, (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
1202 PCPU_SET(ldt, (struct system_segment_descriptor *)&gdt[GUSERLDT_SEL]);
1203 PCPU_SET(fs32p, &gdt[GUFS32_SEL]);
1204 PCPU_SET(gs32p, &gdt[GUGS32_SEL]);
1205 PCPU_SET(ucr3_load_mask, PMAP_UCR3_NOMASK);
1206 PCPU_SET(smp_tlb_gen, 1);
1207 }
1208
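/*
 * Second-stage BSP pcpu setup: record the kernel stack top (rsp0), the PTI
 * trampoline stack top and curpcb.
 */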
1209 void
amd64_bsp_pcpu_init2(uint64_t rsp0)
1211 {
1212
1213 PCPU_SET(rsp0, rsp0);
1214 PCPU_SET(pti_rsp0, ((vm_offset_t)PCPU_PTR(pti_stack) +
1215 PC_PTI_STACK_SZ * sizeof(uint64_t)) & ~0xful);
1216 PCPU_SET(curpcb, thread0.td_pcb);
1217 }
1218
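/*
 * Point the BSP's IST entries at the dedicated double fault, NMI, MC# and
 * DB# stacks, stashing the pcpu pointer just above the top of each stack.
 */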
1219 void
amd64_bsp_ist_init(struct pcpu *pc)
1221 {
1222 struct nmi_pcpu *np;
1223 struct amd64tss *tssp;
1224
1225 tssp = &pc->pc_common_tss;
1226
1227 /* doublefault stack space, runs on ist1 */
1228 np = ((struct nmi_pcpu *)&dblfault_stack[sizeof(dblfault_stack)]) - 1;
1229 np->np_pcpu = (register_t)pc;
1230 tssp->tss_ist1 = (long)np;
1231
1232 /*
1233 * NMI stack, runs on ist2. The pcpu pointer is stored just
1234 * above the start of the ist2 stack.
1235 */
1236 np = ((struct nmi_pcpu *)&nmi0_stack[sizeof(nmi0_stack)]) - 1;
1237 np->np_pcpu = (register_t)pc;
1238 tssp->tss_ist2 = (long)np;
1239
1240 /*
1241 * MC# stack, runs on ist3. The pcpu pointer is stored just
1242 * above the start of the ist3 stack.
1243 */
1244 np = ((struct nmi_pcpu *)&mce0_stack[sizeof(mce0_stack)]) - 1;
1245 np->np_pcpu = (register_t)pc;
1246 tssp->tss_ist3 = (long)np;
1247
1248 /*
1249 * DB# stack, runs on ist4.
1250 */
1251 np = ((struct nmi_pcpu *)&dbg0_stack[sizeof(dbg0_stack)]) - 1;
1252 np->np_pcpu = (register_t)pc;
1253 tssp->tss_ist4 = (long)np;
1254 }
1255
1256 /*
 * Calculate the kernel load address by inspecting the page table created
 * by the loader.
1258 * The assumptions:
1259 * - kernel is mapped at KERNBASE, backed by contiguous phys memory
1260 * aligned at 2M, below 4G (the latter is important for AP startup)
1261 * - there is a 2M hole at KERNBASE (KERNSTART = KERNBASE + 2M)
1262 * - kernel is mapped with 2M superpages
1263 * - all participating memory, i.e. kernel, modules, metadata,
1264 * page table is accessible by pre-created 1:1 mapping
1265 * (right now loader creates 1:1 mapping for lower 4G, and all
1266 * memory is from there)
1267 * - there is a usable memory block right after the end of the
1268 * mapped kernel and all modules/metadata, pointed to by
1269 * physfree, for early allocations
1270 */
1271 vm_paddr_t __nosanitizeaddress __nosanitizememory
amd64_loadaddr(void)
1273 {
1274 pml4_entry_t *pml4e;
1275 pdp_entry_t *pdpe;
1276 pd_entry_t *pde;
1277 uint64_t cr3;
1278
1279 cr3 = rcr3();
1280 pml4e = (pml4_entry_t *)cr3 + pmap_pml4e_index(KERNSTART);
1281 pdpe = (pdp_entry_t *)(*pml4e & PG_FRAME) + pmap_pdpe_index(KERNSTART);
1282 pde = (pd_entry_t *)(*pdpe & PG_FRAME) + pmap_pde_index(KERNSTART);
1283 return (*pde & PG_FRAME);
1284 }
1285
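/*
 * hammer_time() is the early amd64 machine-dependent startup routine.  It is
 * called from locore with the preloaded module pointer and the first free
 * physical address, sets up the GDT, IDT, per-CPU data, pmap and console,
 * and returns the top of thread0's kernel stack for locore to switch to.
 */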
1286 u_int64_t
hammer_time(u_int64_t modulep, u_int64_t physfree)
1288 {
1289 caddr_t kmdp;
1290 int gsel_tss, x;
1291 struct pcpu *pc;
1292 uint64_t rsp0;
1293 char *env;
1294 struct user_segment_descriptor *gdt;
1295 struct region_descriptor r_gdt;
1296 size_t kstack0_sz;
1297
1298 TSRAW(&thread0, TS_ENTER, __func__, NULL);
1299
1300 kernphys = amd64_loadaddr();
1301
1302 physfree += kernphys;
1303
1304 kmdp = init_ops.parse_preload_data(modulep);
1305
1306 efi_boot = preload_search_info(kmdp, MODINFO_METADATA |
1307 MODINFOMD_EFI_MAP) != NULL;
1308
1309 if (!efi_boot) {
1310 /* Tell the bios to warmboot next time */
1311 atomic_store_short((u_short *)0x472, 0x1234);
1312 }
1313
1314 physfree += ucode_load_bsp(physfree - kernphys + KERNSTART);
1315 physfree = roundup2(physfree, PAGE_SIZE);
1316
1317 identify_cpu1();
1318 identify_hypervisor();
1319 identify_hypervisor_smbios();
1320 identify_cpu_fixup_bsp();
1321 identify_cpu2();
1322 initializecpucache();
1323
1324 /*
1325 * Check for pti, pcid, and invpcid before ifuncs are
1326 * resolved, to correctly select the implementation for
1327 * pmap_activate_sw_mode().
1328 */
1329 pti = pti_get_default();
1330 TUNABLE_INT_FETCH("vm.pmap.pti", &pti);
1331 TUNABLE_INT_FETCH("vm.pmap.pcid_enabled", &pmap_pcid_enabled);
1332 if ((cpu_feature2 & CPUID2_PCID) != 0 && pmap_pcid_enabled) {
1333 invpcid_works = (cpu_stdext_feature &
1334 CPUID_STDEXT_INVPCID) != 0;
1335 } else {
1336 pmap_pcid_enabled = 0;
1337 }
1338
1339 /*
1340 * Now we can do small core initialization, after the PCID
1341 * CPU features and user knobs are evaluated.
1342 */
1343 TUNABLE_INT_FETCH("vm.pmap.pcid_invlpg_workaround",
1344 &pmap_pcid_invlpg_workaround_uena);
1345 cpu_init_small_core();
1346
1347 if ((cpu_feature2 & CPUID2_XSAVE) != 0) {
1348 use_xsave = 1;
1349 TUNABLE_INT_FETCH("hw.use_xsave", &use_xsave);
1350 }
1351
1352 link_elf_ireloc(kmdp);
1353
1354 /*
1355 * This may be done better later if it gets more high level
1356 * components in it. If so just link td->td_proc here.
1357 */
1358 proc_linkup0(&proc0, &thread0);
1359
1360 /* Init basic tunables, hz etc */
1361 init_param1();
1362
1363 thread0.td_kstack = physfree - kernphys + KERNSTART;
1364 thread0.td_kstack_pages = kstack_pages;
1365 kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE;
1366 bzero((void *)thread0.td_kstack, kstack0_sz);
1367 physfree += kstack0_sz;
1368
1369 /*
1370 * Initialize enough of thread0 for delayed invalidation to
1371 * work very early. Rely on thread0.td_base_pri
1372 * zero-initialization, it is reset to PVM at proc0_init().
1373 */
1374 pmap_thread_init_invl_gen(&thread0);
1375
1376 pc = &temp_bsp_pcpu;
1377 pcpu_init(pc, 0, sizeof(struct pcpu));
1378 gdt = &temp_bsp_pcpu.pc_gdt[0];
1379
1380 /*
1381 * make gdt memory segments
1382 */
1383 for (x = 0; x < NGDT; x++) {
1384 if (x != GPROC0_SEL && x != (GPROC0_SEL + 1) &&
1385 x != GUSERLDT_SEL && x != (GUSERLDT_SEL + 1))
1386 ssdtosd(&gdt_segs[x], &gdt[x]);
1387 }
1388 gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&pc->pc_common_tss;
1389 ssdtosyssd(&gdt_segs[GPROC0_SEL],
1390 (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
1391
1392 r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
1393 r_gdt.rd_base = (long)gdt;
1394 lgdt(&r_gdt);
1395
1396 wrmsr(MSR_FSBASE, 0); /* User value */
1397 wrmsr(MSR_GSBASE, (u_int64_t)pc);
1398 wrmsr(MSR_KGSBASE, 0); /* User value while in the kernel */
1399
1400 dpcpu_init((void *)(physfree - kernphys + KERNSTART), 0);
1401 physfree += DPCPU_SIZE;
1402 amd64_bsp_pcpu_init1(pc);
1403 /* Non-late cninit() and printf() can be moved up to here. */
1404
1405 /*
1406 * Initialize mutexes.
1407 *
1408 * icu_lock: in order to allow an interrupt to occur in a critical
1409 * section, to set pcpu->ipending (etc...) properly, we
1410 * must be able to get the icu lock, so it can't be
1411 * under witness.
1412 */
1413 mutex_init();
1414 mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS);
1415 mtx_init(&dt_lock, "descriptor tables", NULL, MTX_DEF);
1416
1417 /* exceptions */
1418 for (x = 0; x < NIDT; x++)
1419 setidt(x, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_SYSIGT,
1420 SEL_KPL, 0);
1421 setidt(IDT_DE, pti ? &IDTVEC(div_pti) : &IDTVEC(div), SDT_SYSIGT,
1422 SEL_KPL, 0);
1423 setidt(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 4);
1424 setidt(IDT_NMI, &IDTVEC(nmi), SDT_SYSIGT, SEL_KPL, 2);
1425 setidt(IDT_BP, pti ? &IDTVEC(bpt_pti) : &IDTVEC(bpt), SDT_SYSIGT,
1426 SEL_UPL, 0);
1427 setidt(IDT_OF, pti ? &IDTVEC(ofl_pti) : &IDTVEC(ofl), SDT_SYSIGT,
1428 SEL_UPL, 0);
1429 setidt(IDT_BR, pti ? &IDTVEC(bnd_pti) : &IDTVEC(bnd), SDT_SYSIGT,
1430 SEL_KPL, 0);
1431 setidt(IDT_UD, pti ? &IDTVEC(ill_pti) : &IDTVEC(ill), SDT_SYSIGT,
1432 SEL_KPL, 0);
1433 setidt(IDT_NM, pti ? &IDTVEC(dna_pti) : &IDTVEC(dna), SDT_SYSIGT,
1434 SEL_KPL, 0);
1435 setidt(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
1436 setidt(IDT_FPUGP, pti ? &IDTVEC(fpusegm_pti) : &IDTVEC(fpusegm),
1437 SDT_SYSIGT, SEL_KPL, 0);
1438 setidt(IDT_TS, pti ? &IDTVEC(tss_pti) : &IDTVEC(tss), SDT_SYSIGT,
1439 SEL_KPL, 0);
1440 setidt(IDT_NP, pti ? &IDTVEC(missing_pti) : &IDTVEC(missing),
1441 SDT_SYSIGT, SEL_KPL, 0);
1442 setidt(IDT_SS, pti ? &IDTVEC(stk_pti) : &IDTVEC(stk), SDT_SYSIGT,
1443 SEL_KPL, 0);
1444 setidt(IDT_GP, pti ? &IDTVEC(prot_pti) : &IDTVEC(prot), SDT_SYSIGT,
1445 SEL_KPL, 0);
1446 setidt(IDT_PF, pti ? &IDTVEC(page_pti) : &IDTVEC(page), SDT_SYSIGT,
1447 SEL_KPL, 0);
1448 setidt(IDT_MF, pti ? &IDTVEC(fpu_pti) : &IDTVEC(fpu), SDT_SYSIGT,
1449 SEL_KPL, 0);
1450 setidt(IDT_AC, pti ? &IDTVEC(align_pti) : &IDTVEC(align), SDT_SYSIGT,
1451 SEL_KPL, 0);
1452 setidt(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 3);
1453 setidt(IDT_XF, pti ? &IDTVEC(xmm_pti) : &IDTVEC(xmm), SDT_SYSIGT,
1454 SEL_KPL, 0);
1455 #ifdef KDTRACE_HOOKS
1456 setidt(IDT_DTRACE_RET, pti ? &IDTVEC(dtrace_ret_pti) :
1457 &IDTVEC(dtrace_ret), SDT_SYSIGT, SEL_UPL, 0);
1458 #endif
1459 #ifdef XENHVM
1460 setidt(IDT_EVTCHN, pti ? &IDTVEC(xen_intr_upcall_pti) :
1461 &IDTVEC(xen_intr_upcall), SDT_SYSIGT, SEL_KPL, 0);
1462 #endif
1463 r_idt.rd_limit = sizeof(idt0) - 1;
1464 r_idt.rd_base = (long) idt;
1465 lidt(&r_idt);
1466
1467 TUNABLE_INT_FETCH("hw.ibrs_disable", &hw_ibrs_disable);
1468 TUNABLE_INT_FETCH("machdep.mitigations.ibrs.disable", &hw_ibrs_disable);
1469
1470 TUNABLE_INT_FETCH("hw.spec_store_bypass_disable", &hw_ssb_disable);
1471 TUNABLE_INT_FETCH("machdep.mitigations.ssb.disable", &hw_ssb_disable);
1472
1473 TUNABLE_INT_FETCH("machdep.syscall_ret_flush_l1d",
1474 &syscall_ret_l1d_flush_mode);
1475
1476 TUNABLE_INT_FETCH("hw.mds_disable", &hw_mds_disable);
1477 TUNABLE_INT_FETCH("machdep.mitigations.mds.disable", &hw_mds_disable);
1478
1479 TUNABLE_INT_FETCH("machdep.mitigations.taa.enable", &x86_taa_enable);
1480
1481 TUNABLE_INT_FETCH("machdep.mitigations.rngds.enable",
1482 &x86_rngds_mitg_enable);
1483
1484 TUNABLE_INT_FETCH("machdep.mitigations.zenbleed.enable",
1485 &zenbleed_enable);
1486 zenbleed_sanitize_enable();
1487
1488 finishidentcpu(); /* Final stage of CPU initialization */
1489
1490 invlpgb_works = (amd_extended_feature_extensions &
1491 AMDFEID_INVLPGB) != 0;
1492 TUNABLE_INT_FETCH("vm.pmap.invlpgb_works", &invlpgb_works);
1493 if (invlpgb_works)
1494 invlpgb_maxcnt = cpu_procinfo3 & AMDID_INVLPGB_MAXCNT;
1495
1496 /*
1497 * Initialize the clock before the console so that console
1498 * initialization can use DELAY().
1499 */
1500 clock_init();
1501
1502 initializecpu(); /* Initialize CPU registers */
1503
1504 amd64_bsp_ist_init(pc);
1505
1506 /* Set the IO permission bitmap (empty due to tss seg limit) */
1507 pc->pc_common_tss.tss_iobase = sizeof(struct amd64tss) +
1508 IOPERM_BITMAP_SIZE;
1509
1510 gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
1511 ltr(gsel_tss);
1512
1513 amd64_conf_fast_syscall();
1514
1515 /*
1516 * We initialize the PCB pointer early so that exception
1517 * handlers will work. Also set up td_critnest to short-cut
1518 * the page fault handler.
1519 */
1520 cpu_max_ext_state_size = sizeof(struct savefpu);
1521 set_top_of_stack_td(&thread0);
1522 thread0.td_pcb = get_pcb_td(&thread0);
1523 thread0.td_critnest = 1;
1524
1525 /*
1526 * The console and kdb should be initialized even earlier than here,
1527 * but some console drivers don't work until after getmemsize().
1528 * Default to late console initialization to support these drivers.
1529 * This loses mainly printf()s in getmemsize() and early debugging.
1530 */
1531 TUNABLE_INT_FETCH("debug.late_console", &late_console);
1532 if (!late_console) {
1533 cninit();
1534 amd64_kdb_init();
1535 }
1536
1537 getmemsize(kmdp, physfree);
1538 init_param2(physmem);
1539
/* Now running on new page tables, configured, and u/iom is accessible. */
1541
1542 #ifdef DEV_PCI
1543 /* This call might adjust phys_avail[]. */
1544 pci_early_quirks();
1545 #endif
1546
1547 if (late_console)
1548 cninit();
1549
1550 /*
1551 * Dump the boot metadata. We have to wait for cninit() since console
1552 * output is required. If it's grossly incorrect the kernel will never
1553 * make it this far.
1554 */
1555 if (getenv_is_true("debug.dump_modinfo_at_boot"))
1556 preload_dump();
1557
1558 #ifdef DEV_ISA
1559 #ifdef DEV_ATPIC
1560 elcr_probe();
1561 atpic_startup();
1562 #else
1563 /* Reset and mask the atpics and leave them shut down. */
1564 atpic_reset();
1565
1566 /*
1567 * Point the ICU spurious interrupt vectors at the APIC spurious
1568 * interrupt handler.
1569 */
1570 setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
1571 setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
1572 #endif
1573 #else
1574 #error "have you forgotten the isa device?"
1575 #endif
1576
1577 if (late_console)
1578 amd64_kdb_init();
1579
1580 msgbufinit(msgbufp, msgbufsize);
1581 fpuinit();
1582
/* Make an initial TSS so the CPU can get an interrupt stack on syscall entry. */
1584 rsp0 = thread0.td_md.md_stack_base;
1585 /* Ensure the stack is aligned to 16 bytes */
1586 rsp0 &= ~0xFul;
1587 PCPU_PTR(common_tss)->tss_rsp0 = rsp0;
1588 amd64_bsp_pcpu_init2(rsp0);
1589
1590 /* transfer to user mode */
1591
1592 _ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
1593 _udatasel = GSEL(GUDATA_SEL, SEL_UPL);
1594 _ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL);
1595 _ufssel = GSEL(GUFS32_SEL, SEL_UPL);
1596 _ugssel = GSEL(GUGS32_SEL, SEL_UPL);
1597
1598 load_ds(_udatasel);
1599 load_es(_udatasel);
1600 load_fs(_ufssel);
1601
1602 /* setup proc 0's pcb */
1603 thread0.td_pcb->pcb_flags = 0;
1604
1605 env = kern_getenv("kernelname");
1606 if (env != NULL)
1607 strlcpy(kernelname, env, sizeof(kernelname));
1608
1609 kcsan_cpu_init(0);
1610
1611 #ifdef FDT
1612 x86_init_fdt();
1613 #endif
1614 thread0.td_critnest = 0;
1615
1616 kasan_init();
1617 kmsan_init();
1618
1619 TSEXIT();
1620
1621 /* Location of kernel stack for locore */
1622 return (thread0.td_md.md_stack_base);
1623 }
1624
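/* Machine-dependent pcpu initialization: mark the per-CPU ACPI id as not yet assigned. */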
1625 void
cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
1627 {
1628
1629 pcpu->pc_acpi_id = 0xffffffff;
1630 }
1631
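/* Export the raw BIOS SMAP entries supplied by the loader via sysctl. */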
1632 static int
smap_sysctl_handler(SYSCTL_HANDLER_ARGS)
1634 {
1635 struct bios_smap *smapbase;
1636 struct bios_smap_xattr smap;
1637 caddr_t kmdp;
1638 uint32_t *smapattr;
1639 int count, error, i;
1640
1641 /* Retrieve the system memory map from the loader. */
1642 kmdp = preload_search_by_type("elf kernel");
1643 if (kmdp == NULL)
1644 kmdp = preload_search_by_type("elf64 kernel");
1645 smapbase = (struct bios_smap *)preload_search_info(kmdp,
1646 MODINFO_METADATA | MODINFOMD_SMAP);
1647 if (smapbase == NULL)
1648 return (0);
1649 smapattr = (uint32_t *)preload_search_info(kmdp,
1650 MODINFO_METADATA | MODINFOMD_SMAP_XATTR);
1651 count = *((uint32_t *)smapbase - 1) / sizeof(*smapbase);
1652 error = 0;
1653 for (i = 0; i < count; i++) {
1654 smap.base = smapbase[i].base;
1655 smap.length = smapbase[i].length;
1656 smap.type = smapbase[i].type;
1657 if (smapattr != NULL)
1658 smap.xattr = smapattr[i];
1659 else
1660 smap.xattr = 0;
1661 error = SYSCTL_OUT(req, &smap, sizeof(smap));
1662 }
1663 return (error);
1664 }
1665 SYSCTL_PROC(_machdep, OID_AUTO, smap,
1666 CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
1667 smap_sysctl_handler, "S,bios_smap_xattr",
1668 "Raw BIOS SMAP data");
1669
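/* Export the raw EFI memory map supplied by the loader via sysctl. */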
1670 static int
efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS)
1672 {
1673 struct efi_map_header *efihdr;
1674 caddr_t kmdp;
1675 uint32_t efisize;
1676
1677 kmdp = preload_search_by_type("elf kernel");
1678 if (kmdp == NULL)
1679 kmdp = preload_search_by_type("elf64 kernel");
1680 efihdr = (struct efi_map_header *)preload_search_info(kmdp,
1681 MODINFO_METADATA | MODINFOMD_EFI_MAP);
1682 if (efihdr == NULL)
1683 return (0);
1684 efisize = *((uint32_t *)efihdr - 1);
1685 return (SYSCTL_OUT(req, efihdr, efisize));
1686 }
1687 SYSCTL_PROC(_machdep, OID_AUTO, efi_map,
1688 CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
1689 efi_map_sysctl_handler, "S,efi_map_header",
1690 "Raw EFI Memory Map");
1691
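/* Export the EFI firmware architecture string supplied by the loader via sysctl. */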
1692 static int
efi_arch_sysctl_handler(SYSCTL_HANDLER_ARGS)
1694 {
1695 char *arch;
1696 caddr_t kmdp;
1697
1698 kmdp = preload_search_by_type("elf kernel");
1699 if (kmdp == NULL)
1700 kmdp = preload_search_by_type("elf64 kernel");
1701
1702 arch = (char *)preload_search_info(kmdp,
1703 MODINFO_METADATA | MODINFOMD_EFI_ARCH);
1704 if (arch == NULL)
1705 return (0);
1706
1707 return (SYSCTL_OUT_STR(req, arch));
1708 }
1709 SYSCTL_PROC(_machdep, OID_AUTO, efi_arch,
1710 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
1711 efi_arch_sysctl_handler, "A", "EFI Firmware Architecture");
1712
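/*
 * Spinlock enter/exit: disable interrupts on the first acquisition and
 * restore the saved flags once the outermost spinlock is released.
 */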
1713 void
spinlock_enter(void)
1715 {
1716 struct thread *td;
1717 register_t flags;
1718
1719 td = curthread;
1720 if (td->td_md.md_spinlock_count == 0) {
1721 flags = intr_disable();
1722 td->td_md.md_spinlock_count = 1;
1723 td->td_md.md_saved_flags = flags;
1724 critical_enter();
1725 } else
1726 td->td_md.md_spinlock_count++;
1727 }
1728
1729 void
spinlock_exit(void)
1731 {
1732 struct thread *td;
1733 register_t flags;
1734
1735 td = curthread;
1736 flags = td->td_md.md_saved_flags;
1737 td->td_md.md_spinlock_count--;
1738 if (td->td_md.md_spinlock_count == 0) {
1739 critical_exit();
1740 intr_restore(flags);
1741 }
1742 }
1743
1744 /*
1745 * Construct a PCB from a trapframe. This is called from kdb_trap() where
1746 * we want to start a backtrace from the function that caused us to enter
1747 * the debugger. We have the context in the trapframe, but base the trace
1748 * on the PCB. The PCB doesn't have to be perfect, as long as it contains
1749 * enough for a backtrace.
1750 */
1751 void
makectx(struct trapframe *tf, struct pcb *pcb)
1753 {
1754
1755 pcb->pcb_r12 = tf->tf_r12;
1756 pcb->pcb_r13 = tf->tf_r13;
1757 pcb->pcb_r14 = tf->tf_r14;
1758 pcb->pcb_r15 = tf->tf_r15;
1759 pcb->pcb_rbp = tf->tf_rbp;
1760 pcb->pcb_rbx = tf->tf_rbx;
1761 pcb->pcb_rip = tf->tf_rip;
1762 pcb->pcb_rsp = tf->tf_rsp;
1763 }
1764
/*
 * pcb_flags is only modified by the current thread, or by other threads
 * when the current thread is stopped.  However, the current thread may
 * change it from interrupt context in cpu_switch(), or in the trap handler.
 * When we read-modify-write pcb_flags from C sources, the compiler may
 * generate code that is not atomic with respect to the interrupt handler.
 * If a trap or interrupt happens and any flag is modified from the handler,
 * it can be clobbered with the cached value later.  Therefore, we implement
 * setting and clearing flags with single-instruction functions, which do
 * not race with possible modification of the flags from the trap or
 * interrupt context, because traps and interrupts are executed only on
 * instruction boundaries.
 */
1777 void
set_pcb_flags_raw(struct pcb *pcb, const u_int flags)
1779 {
1780
1781 __asm __volatile("orl %1,%0"
1782 : "=m" (pcb->pcb_flags) : "ir" (flags), "m" (pcb->pcb_flags)
1783 : "cc", "memory");
1784
1785 }
1786
/*
 * Support for the RDFSBASE, WRFSBASE and similar instructions for the %gs
 * base requires that the kernel save MSR_FSBASE and MSR_{K,}GSBASE into
 * the pcb if user space modified the bases.  We must save them on a context
 * switch or if the return to usermode happens through doreti.
 *
 * Tracking of both events is performed by the pcb flag PCB_FULL_IRET,
 * which has the consequence that the base MSRs must be saved each time
 * the PCB_FULL_IRET flag is set.  We disable interrupts to sync with
 * context switches.
 */
1798 static void
set_pcb_flags_fsgsbase(struct pcb *pcb, const u_int flags)
1800 {
1801 register_t r;
1802
1803 if (curpcb == pcb &&
1804 (flags & PCB_FULL_IRET) != 0 &&
1805 (pcb->pcb_flags & PCB_FULL_IRET) == 0) {
1806 r = intr_disable();
1807 if ((pcb->pcb_flags & PCB_FULL_IRET) == 0) {
1808 if (rfs() == _ufssel)
1809 pcb->pcb_fsbase = rdfsbase();
1810 if (rgs() == _ugssel)
1811 pcb->pcb_gsbase = rdmsr(MSR_KGSBASE);
1812 }
1813 set_pcb_flags_raw(pcb, flags);
1814 intr_restore(r);
1815 } else {
1816 set_pcb_flags_raw(pcb, flags);
1817 }
1818 }
1819
1820 DEFINE_IFUNC(, void, set_pcb_flags, (struct pcb *, const u_int))
1821 {
1822
1823 return ((cpu_stdext_feature & CPUID_STDEXT_FSGSBASE) != 0 ?
1824 set_pcb_flags_fsgsbase : set_pcb_flags_raw);
1825 }
1826
1827 void
clear_pcb_flags(struct pcb *pcb, const u_int flags)
1829 {
1830
1831 __asm __volatile("andl %1,%0"
1832 : "=m" (pcb->pcb_flags) : "ir" (~flags), "m" (pcb->pcb_flags)
1833 : "cc", "memory");
1834 }
1835
1836 #ifdef KDB
1837
1838 /*
1839 * Provide inb() and outb() as functions. They are normally only available as
1840 * inline functions, thus cannot be called from the debugger.
1841 */
1842
1843 /* silence compiler warnings */
1844 u_char inb_(u_short);
1845 void outb_(u_short, u_char);
1846
1847 u_char
inb_(u_short port)
1849 {
1850 return inb(port);
1851 }
1852
1853 void
outb_(u_short port, u_char data)
1855 {
1856 outb(port, data);
1857 }
1858
1859 #endif /* KDB */
1860
1861 #undef memset
1862 #undef memmove
1863 #undef memcpy
1864
1865 void *memset_std(void *buf, int c, size_t len);
1866 void *memset_erms(void *buf, int c, size_t len);
1867 void *memmove_std(void * _Nonnull dst, const void * _Nonnull src,
1868 size_t len);
1869 void *memmove_erms(void * _Nonnull dst, const void * _Nonnull src,
1870 size_t len);
1871 void *memcpy_std(void * _Nonnull dst, const void * _Nonnull src,
1872 size_t len);
1873 void *memcpy_erms(void * _Nonnull dst, const void * _Nonnull src,
1874 size_t len);
1875
1876 #ifdef KCSAN
1877 /*
1878 * These fail to build as ifuncs when used with KCSAN.
1879 */
1880 void *
memset(void *buf, int c, size_t len)
1882 {
1883
1884 return (memset_std(buf, c, len));
1885 }
1886
1887 void *
memmove(void * _Nonnull dst, const void * _Nonnull src, size_t len)
1889 {
1890
1891 return (memmove_std(dst, src, len));
1892 }
1893
1894 void *
memcpy(void * _Nonnull dst, const void * _Nonnull src, size_t len)
1896 {
1897
1898 return (memcpy_std(dst, src, len));
1899 }
1900 #else
1901 DEFINE_IFUNC(, void *, memset, (void *, int, size_t))
1902 {
1903
1904 return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
1905 memset_erms : memset_std);
1906 }
1907
1908 DEFINE_IFUNC(, void *, memmove, (void * _Nonnull, const void * _Nonnull,
1909 size_t))
1910 {
1911
1912 return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
1913 memmove_erms : memmove_std);
1914 }
1915
1916 DEFINE_IFUNC(, void *, memcpy, (void * _Nonnull, const void * _Nonnull,size_t))
1917 {
1918
1919 return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
1920 memcpy_erms : memcpy_std);
1921 }
1922 #endif
1923
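/* Select the pagezero implementation based on ERMS support. */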
1924 void pagezero_std(void *addr);
1925 void pagezero_erms(void *addr);
1926 DEFINE_IFUNC(, void , pagezero, (void *))
1927 {
1928
1929 return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
1930 pagezero_erms : pagezero_std);
1931 }
1932