/*-
 * Copyright (c) 2003 Peter Wemm.
 * Copyright (c) 1992 Terrence R. Lambert.
 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * William Jolitz.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from: @(#)machdep.c	7.4 (Berkeley) 6/3/91
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_atpic.h"
#include "opt_cpu.h"
#include "opt_ddb.h"
#include "opt_inet.h"
#include "opt_isa.h"
#include "opt_kdb.h"
#include "opt_kstack_pages.h"
#include "opt_maxmem.h"
#include "opt_mp_watchdog.h"
#include "opt_platform.h"
#ifdef __i386__
#include "opt_apic.h"
#endif

#include <sys/param.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/bus.h>
#include <sys/cpu.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/pcpu.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/sysctl.h>

#include <machine/clock.h>
#include <machine/cpu.h>
#include <machine/cputypes.h>
#include <machine/specialreg.h>
#include <machine/md_var.h>
#include <machine/mp_watchdog.h>
#include <machine/tss.h>
#ifdef SMP
#include <machine/smp.h>
#endif
#ifdef CPU_ELAN
#include <machine/elan_mmcr.h>
#endif
#include <x86/acpica_machdep.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_pager.h>
#include <vm/vm_param.h>

#include <isa/isareg.h>

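/*
 * Per-CPU idle state tracked in the pcpu monitorbuf area: RUNNING means
 * the CPU is executing normally, MWAIT means it is (or is about to be)
 * parked in mwait on the monitored buffer, and SLEEPING means it is
 * halted and needs an interrupt to wake.
 */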
#define	STATE_RUNNING	0x0
#define	STATE_MWAIT	0x1
#define	STATE_SLEEPING	0x2

#ifdef SMP
static u_int	cpu_reset_proxyid;
static volatile u_int	cpu_reset_proxy_active;
#endif

/*
 * Machine-dependent boot() routine.
 *
 * Nothing needs to be done here yet; some functionality may eventually
 * be grafted back here from boot().
 */
void
cpu_boot(int howto)
{
}

/*
 * Flush the D-cache for non-DMA I/O so that the I-cache can
 * be made coherent later.
 */
void
cpu_flush_dcache(void *ptr, size_t len)
{
	/* Not applicable */
}

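/*
 * Enter the C1 idle state.  The sti;hlt pair re-enables interrupts and
 * halts atomically: interrupt delivery is held off until the hlt
 * executes, so a wakeup interrupt cannot be lost between the two
 * instructions.
 */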
void
acpi_cpu_c1(void)
{

	__asm __volatile("sti; hlt");
}

/*
 * Use mwait to pause execution while waiting for an interrupt or
 * another thread to signal that there is more work.
 *
 * NOTE: Interrupts will cause a wakeup; however, this function does
 * not enable interrupt handling.  The caller is responsible for
 * enabling interrupts.
 */
void
acpi_cpu_idle_mwait(uint32_t mwait_hint)
{
	int *state;
	uint64_t v;

	/*
	 * A comment in a Linux patch claims that 'CPUs run faster with
	 * speculation protection disabled. All CPU threads in a core
	 * must disable speculation protection for it to be
	 * disabled. Disable it while we are idle so the other
	 * hyperthread can run fast.'
	 *
	 * XXXKIB.  Software coordination mode should be supported,
	 * but all Intel CPUs provide hardware coordination.
	 */

	state = (int *)PCPU_PTR(monitorbuf);
	KASSERT(atomic_load_int(state) == STATE_SLEEPING,
	    ("cpu_mwait_cx: wrong monitorbuf state"));
	atomic_store_int(state, STATE_MWAIT);
	if (PCPU_GET(ibpb_set) || hw_ssb_active) {
		v = rdmsr(MSR_IA32_SPEC_CTRL);
		wrmsr(MSR_IA32_SPEC_CTRL, v & ~(IA32_SPEC_CTRL_IBRS |
		    IA32_SPEC_CTRL_STIBP | IA32_SPEC_CTRL_SSBD));
	} else {
		v = 0;
	}
	cpu_monitor(state, 0, 0);
	if (atomic_load_int(state) == STATE_MWAIT)
		cpu_mwait(MWAIT_INTRBREAK, mwait_hint);

	/*
	 * SSB cannot be disabled while we sleep; rather, if it were
	 * disabled, the sysctl thread would bind to our CPU to tweak
	 * the MSR.
	 */
	if (v != 0)
		wrmsr(MSR_IA32_SPEC_CTRL, v);

	/*
	 * We should exit on any event that interrupts mwait, because
	 * that event might be a wanted interrupt.
	 */
	atomic_store_int(state, STATE_RUNNING);
}

/* Get current clock frequency for the given cpu id. */
int
cpu_est_clockrate(int cpu_id, uint64_t *rate)
{
	uint64_t tsc1, tsc2;
	uint64_t acnt, mcnt, perf;
	register_t reg;

	if (pcpu_find(cpu_id) == NULL || rate == NULL)
		return (EINVAL);
#ifdef __i386__
	if ((cpu_feature & CPUID_TSC) == 0)
		return (EOPNOTSUPP);
#endif

	/*
	 * If the TSC is P-state invariant but the APERF/MPERF MSRs do not
	 * exist, the DELAY(9)-based calibration logic below fails.
	 */
	if (tsc_is_invariant && !tsc_perf_stat)
		return (EOPNOTSUPP);

#ifdef SMP
	if (smp_cpus > 1) {
		/* Schedule ourselves on the indicated cpu. */
		thread_lock(curthread);
		sched_bind(curthread, cpu_id);
		thread_unlock(curthread);
	}
#endif

	/* Calibrate by measuring a short delay. */
	reg = intr_disable();
	if (tsc_is_invariant) {
		wrmsr(MSR_MPERF, 0);
		wrmsr(MSR_APERF, 0);
		tsc1 = rdtsc();
		DELAY(1000);
		mcnt = rdmsr(MSR_MPERF);
		acnt = rdmsr(MSR_APERF);
		tsc2 = rdtsc();
		intr_restore(reg);
		perf = 1000 * acnt / mcnt;
		*rate = (tsc2 - tsc1) * perf;
	} else {
		tsc1 = rdtsc();
		DELAY(1000);
		tsc2 = rdtsc();
		intr_restore(reg);
		*rate = (tsc2 - tsc1) * 1000;
	}

#ifdef SMP
	if (smp_cpus > 1) {
		thread_lock(curthread);
		sched_unbind(curthread);
		thread_unlock(curthread);
	}
#endif

	return (0);
}

/*
 * Shut down the CPU as much as possible.
 */
void
cpu_halt(void)
{
	for (;;)
		halt();
}

static void
cpu_reset_real(void)
{
	struct region_descriptor null_idt;
	int b;

	disable_intr();
#ifdef CPU_ELAN
	if (elan_mmcr != NULL)
		elan_mmcr->RESCFG = 1;
#endif
#ifdef __i386__
	if (cpu == CPU_GEODE1100) {
		/* Attempt Geode's own reset */
		outl(0xcf8, 0x80009044ul);
		outl(0xcfc, 0xf);
	}
#endif
#if !defined(BROKEN_KEYBOARD_RESET)
	/*
	 * Attempt to do a CPU reset via the keyboard controller.
	 * Do not turn off GateA20, as any machine that fails
	 * to do the reset here would then end up in no man's land.
	 */
	outb(IO_KBD + 4, 0xFE);
	DELAY(500000);	/* wait 0.5 sec to see if that did it */
#endif

	/*
	 * Attempt to force a reset via the Reset Control register at
	 * I/O port 0xcf9.  Bit 2 forces a system reset when it
	 * transitions from 0 to 1.  Bit 1 selects the type of reset
	 * to attempt: 0 selects a "soft" reset, and 1 selects a
	 * "hard" reset.  We try a "hard" reset.  The first write sets
	 * bit 1 to select a "hard" reset and clears bit 2.  The
	 * second write forces a 0 -> 1 transition in bit 2 to trigger
	 * a reset.
	 */
	outb(0xcf9, 0x2);
	outb(0xcf9, 0x6);
	DELAY(500000);  /* wait 0.5 sec to see if that did it */

	/*
	 * Attempt to force a reset via the Fast A20 and Init register
	 * at I/O port 0x92.  Bit 1 serves as an alternate A20 gate.
	 * Bit 0 asserts INIT# when set to 1.  We are careful to only
	 * preserve bit 1 while setting bit 0.  We also must clear bit
	 * 0 before setting it if it isn't already clear.
	 */
	b = inb(0x92);
	if (b != 0xff) {
		if ((b & 0x1) != 0)
			outb(0x92, b & 0xfe);
		outb(0x92, b | 0x1);
		DELAY(500000);  /* wait 0.5 sec to see if that did it */
	}

	printf("No known reset method worked, attempting CPU shutdown\n");
	DELAY(1000000); /* wait 1 sec for printf to complete */

	/* Wipe the IDT. */
	null_idt.rd_limit = 0;
	null_idt.rd_base = 0;
	lidt(&null_idt);

	/* "good night, sweet prince .... <THUNK!>" */
	breakpoint();

	/* NOTREACHED */
	while(1);
}

#ifdef SMP
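/*
 * When cpu_reset() is invoked on an AP, the actual reset is proxied to
 * the BSP: the AP records its own id in cpu_reset_proxyid, restarts the
 * stopped BSP with cpu_reset_proxy as the restart function, and spins.
 * The BSP then announces itself via cpu_reset_proxy_active and performs
 * the real reset.
 */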
static void
cpu_reset_proxy(void)
{

	cpu_reset_proxy_active = 1;
	while (cpu_reset_proxy_active == 1)
		ia32_pause(); /* Wait for other cpu to see that we've started */

	printf("cpu_reset_proxy: Stopped CPU %d\n", cpu_reset_proxyid);
	DELAY(1000000);
	cpu_reset_real();
}
#endif

void
cpu_reset(void)
{
#ifdef SMP
	cpuset_t map;
	u_int cnt;

	if (smp_started) {
		map = all_cpus;
		CPU_CLR(PCPU_GET(cpuid), &map);
		CPU_NAND(&map, &stopped_cpus);
		if (!CPU_EMPTY(&map)) {
			printf("cpu_reset: Stopping other CPUs\n");
			stop_cpus(map);
		}

		if (PCPU_GET(cpuid) != 0) {
			cpu_reset_proxyid = PCPU_GET(cpuid);
			cpustop_restartfunc = cpu_reset_proxy;
			cpu_reset_proxy_active = 0;
			printf("cpu_reset: Restarting BSP\n");

			/* Restart CPU #0. */
			CPU_SETOF(0, &started_cpus);
			wmb();

			cnt = 0;
			while (cpu_reset_proxy_active == 0 && cnt < 10000000) {
				ia32_pause();
				cnt++;	/* Wait for BSP to announce restart */
			}
			if (cpu_reset_proxy_active == 0) {
				printf("cpu_reset: Failed to restart BSP\n");
			} else {
				cpu_reset_proxy_active = 2;
				while (1)
					ia32_pause();
				/* NOTREACHED */
			}
		}

		DELAY(1000000);
	}
#endif
	cpu_reset_real();
	/* NOTREACHED */
}

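/*
 * MWAIT is usable for idling only if the CPU advertises the MWAIT
 * extensions and the "interrupt break-event" capability, which allows
 * mwait to wake on an interrupt even with interrupts disabled.
 */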
bool
cpu_mwait_usable(void)
{

	return ((cpu_feature2 & CPUID2_MON) != 0 && ((cpu_mon_mwait_flags &
	    (CPUID5_MON_MWAIT_EXT | CPUID5_MWAIT_INTRBREAK)) ==
	    (CPUID5_MON_MWAIT_EXT | CPUID5_MWAIT_INTRBREAK)));
}

void (*cpu_idle_hook)(sbintime_t) = NULL;	/* ACPI idle hook. */
static int	cpu_ident_amdc1e = 0;	/* AMD C1E supported. */
static int	idle_mwait = 1;		/* Use MONITOR/MWAIT for short idle. */
SYSCTL_INT(_machdep, OID_AUTO, idle_mwait, CTLFLAG_RWTUN, &idle_mwait,
    0, "Use MONITOR/MWAIT for short idle");

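/*
 * Idle via the ACPI idle hook when one is registered (it implements
 * deeper C-states); otherwise fall back to plain C1 via sti;hlt.
 */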
static void
cpu_idle_acpi(sbintime_t sbt)
{
	int *state;

	state = (int *)PCPU_PTR(monitorbuf);
	atomic_store_int(state, STATE_SLEEPING);

	/* See comments in cpu_idle_hlt(). */
	disable_intr();
	if (sched_runnable())
		enable_intr();
	else if (cpu_idle_hook)
		cpu_idle_hook(sbt);
	else
		acpi_cpu_c1();
	atomic_store_int(state, STATE_RUNNING);
}

static void
cpu_idle_hlt(sbintime_t sbt)
{
	int *state;

	state = (int *)PCPU_PTR(monitorbuf);
	atomic_store_int(state, STATE_SLEEPING);

	/*
	 * Since we may be in a critical section from cpu_idle(), if
	 * an interrupt fires during that critical section we may have
	 * a pending preemption.  If the CPU halts, then that thread
	 * may not execute until a later interrupt awakens the CPU.
	 * To handle this race, check for a runnable thread after
	 * disabling interrupts and immediately return if one is
	 * found.  Also, we must absolutely guarantee that hlt is
	 * the next instruction after sti.  This ensures that any
	 * interrupt that fires after the call to disable_intr() will
	 * immediately awaken the CPU from hlt.  Finally, note that on
	 * x86 this works because interrupt delivery is deferred until
	 * the instruction following sti has executed, while IF is set
	 * to 1 immediately, allowing the hlt instruction to acknowledge
	 * the interrupt.
	 */
	disable_intr();
	if (sched_runnable())
		enable_intr();
	else
		acpi_cpu_c1();
	atomic_store_int(state, STATE_RUNNING);
}

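/*
 * Idle using MONITOR/MWAIT in C1.  The monitor is armed on the per-CPU
 * monitorbuf, so the mwait is woken either by an interrupt (sti
 * immediately precedes it) or by another CPU storing STATE_RUNNING
 * into the monitored buffer from cpu_idle_wakeup().
 */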
static void
cpu_idle_mwait(sbintime_t sbt)
{
	int *state;

	state = (int *)PCPU_PTR(monitorbuf);
	atomic_store_int(state, STATE_MWAIT);

	/* See comments in cpu_idle_hlt(). */
	disable_intr();
	if (sched_runnable()) {
		atomic_store_int(state, STATE_RUNNING);
		enable_intr();
		return;
	}

	cpu_monitor(state, 0, 0);
	if (atomic_load_int(state) == STATE_MWAIT)
		__asm __volatile("sti; mwait" : : "a" (MWAIT_C1), "c" (0));
	else
		enable_intr();
	atomic_store_int(state, STATE_RUNNING);
}

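/*
 * Busy-wait instead of sleeping: lowest wakeup latency, highest power
 * use.  The loop is bounded so that cpu_idle() regains control
 * periodically even if no work ever becomes runnable.
 */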
static void
cpu_idle_spin(sbintime_t sbt)
{
	int *state;
	int i;

	state = (int *)PCPU_PTR(monitorbuf);
	atomic_store_int(state, STATE_RUNNING);

	/*
	 * The sched_runnable() call is racy, but since we retry it in
	 * a loop, missing it once has little impact, if any (and it is
	 * much better than not checking at all).
	 */
	for (i = 0; i < 1000; i++) {
		if (sched_runnable())
			return;
		cpu_spinwait();
	}
}

/*
 * C1E renders the local APIC timer dead, so we disable it by
 * reading the Interrupt Pending Message register and clearing
 * both C1eOnCmpHalt (bit 28) and SmiOnCmpHalt (bit 27).
 *
 * Reference:
 *   "BIOS and Kernel Developer's Guide for AMD NPT Family 0Fh Processors"
 *   #32559 revision 3.00+
 */
#define	MSR_AMDK8_IPM		0xc0010055
#define	AMDK8_SMIONCMPHALT	(1ULL << 27)
#define	AMDK8_C1EONCMPHALT	(1ULL << 28)
#define	AMDK8_CMPHALT		(AMDK8_SMIONCMPHALT | AMDK8_C1EONCMPHALT)

void
cpu_probe_amdc1e(void)
{

	/*
	 * Detect the presence of the C1E capability, found mostly on
	 * recent dual-core (and later) K8-family CPUs.
	 */
	if (cpu_vendor_id == CPU_VENDOR_AMD &&
	    (cpu_id & 0x00000f00) == 0x00000f00 &&
	    (cpu_id & 0x0fff0000) >=  0x00040000) {
		cpu_ident_amdc1e = 1;
	}
}

void (*cpu_idle_fn)(sbintime_t) = cpu_idle_acpi;

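/*
 * cpu_idle() is the scheduler's entry point for idling a CPU.  "busy"
 * hints that new work is expected shortly, so we prefer a fast-wakeup
 * method (MWAIT) and skip switching the event timers into idle mode;
 * otherwise the timers are idled around the call to cpu_idle_fn.
 */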
void
cpu_idle(int busy)
{
	uint64_t msr;
	sbintime_t sbt = -1;

	CTR2(KTR_SPARE2, "cpu_idle(%d) at %d",
	    busy, curcpu);
#ifdef MP_WATCHDOG
	ap_watchdog(PCPU_GET(cpuid));
#endif

	/* If we are busy, try to use fast methods. */
	if (busy) {
		if ((cpu_feature2 & CPUID2_MON) && idle_mwait) {
			cpu_idle_mwait(busy);
			goto out;
		}
	}

	/* If we have time, switch timers into idle mode. */
	if (!busy) {
		critical_enter();
		sbt = cpu_idleclock();
	}

	/* Apply AMD APIC timer C1E workaround. */
	if (cpu_ident_amdc1e && cpu_disable_c3_sleep) {
		msr = rdmsr(MSR_AMDK8_IPM);
		if (msr & AMDK8_CMPHALT)
			wrmsr(MSR_AMDK8_IPM, msr & ~AMDK8_CMPHALT);
	}

	/* Call main idle method. */
	cpu_idle_fn(sbt);

	/* Switch timers back into active mode. */
	if (!busy) {
		cpu_activeclock();
		critical_exit();
	}
out:
	CTR2(KTR_SPARE2, "cpu_idle(%d) at %d done",
	    busy, curcpu);
}

static int cpu_idle_apl31_workaround;
SYSCTL_INT(_machdep, OID_AUTO, idle_apl31, CTLFLAG_RW,
    &cpu_idle_apl31_workaround, 0,
    "Apollo Lake APL31 MWAIT bug workaround");

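/*
 * Attempt to wake an idle CPU without sending an IPI.  Returns non-zero
 * if the store to the monitored buffer is sufficient (the CPU is either
 * running or parked in mwait); a sleeping (halted) CPU, or any CPU when
 * the APL31 workaround is active, still requires an interrupt.
 */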
int
cpu_idle_wakeup(int cpu)
{
	int *state;

	state = (int *)pcpu_find(cpu)->pc_monitorbuf;
	switch (atomic_load_int(state)) {
	case STATE_SLEEPING:
		return (0);
	case STATE_MWAIT:
		atomic_store_int(state, STATE_RUNNING);
		return (cpu_idle_apl31_workaround ? 0 : 1);
	case STATE_RUNNING:
		return (1);
	default:
		panic("bad monitor state");
		return (1);
	}
}

/*
 * Ordered by speed/power consumption.
 */
static struct {
	void	*id_fn;
	char	*id_name;
	int	id_cpuid2_flag;
} idle_tbl[] = {
	{ .id_fn = cpu_idle_spin, .id_name = "spin" },
	{ .id_fn = cpu_idle_mwait, .id_name = "mwait",
	    .id_cpuid2_flag = CPUID2_MON },
	{ .id_fn = cpu_idle_hlt, .id_name = "hlt" },
	{ .id_fn = cpu_idle_acpi, .id_name = "acpi" },
};

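/*
 * Report the idle methods usable on this machine, skipping entries whose
 * CPUID feature bit is absent and "acpi" when no ACPI idle hook is
 * registered.  A typical query (output varies with the hardware) might
 * look like:
 *
 *	# sysctl machdep.idle_available
 *	machdep.idle_available: spin, mwait, hlt, acpi
 */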
static int
idle_sysctl_available(SYSCTL_HANDLER_ARGS)
{
	char *avail, *p;
	int error;
	int i;

	avail = malloc(256, M_TEMP, M_WAITOK);
	p = avail;
	for (i = 0; i < nitems(idle_tbl); i++) {
		if (idle_tbl[i].id_cpuid2_flag != 0 &&
		    (cpu_feature2 & idle_tbl[i].id_cpuid2_flag) == 0)
			continue;
		if (strcmp(idle_tbl[i].id_name, "acpi") == 0 &&
		    cpu_idle_hook == NULL)
			continue;
		p += sprintf(p, "%s%s", p != avail ? ", " : "",
		    idle_tbl[i].id_name);
	}
	error = sysctl_handle_string(oidp, avail, 0, req);
	free(avail, M_TEMP);
	return (error);
}

SYSCTL_PROC(_machdep, OID_AUTO, idle_available, CTLTYPE_STRING | CTLFLAG_RD,
    0, 0, idle_sysctl_available, "A", "list of available idle functions");

static bool
cpu_idle_selector(const char *new_idle_name)
{
	int i;

	for (i = 0; i < nitems(idle_tbl); i++) {
		if (idle_tbl[i].id_cpuid2_flag != 0 &&
		    (cpu_feature2 & idle_tbl[i].id_cpuid2_flag) == 0)
			continue;
		if (strcmp(idle_tbl[i].id_name, "acpi") == 0 &&
		    cpu_idle_hook == NULL)
			continue;
		if (strcmp(idle_tbl[i].id_name, new_idle_name))
			continue;
		cpu_idle_fn = idle_tbl[i].id_fn;
		if (bootverbose)
			printf("CPU idle set to %s\n", idle_tbl[i].id_name);
		return (true);
	}
	return (false);
}

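/*
 * machdep.idle: report the currently selected idle method and allow it
 * to be changed at run time.  Any method listed in
 * machdep.idle_available may be chosen, e.g.:
 *
 *	# sysctl machdep.idle=hlt
 */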
static int
cpu_idle_sysctl(SYSCTL_HANDLER_ARGS)
{
	char buf[16], *p;
	int error, i;

	p = "unknown";
	for (i = 0; i < nitems(idle_tbl); i++) {
		if (idle_tbl[i].id_fn == cpu_idle_fn) {
			p = idle_tbl[i].id_name;
			break;
		}
	}
	strncpy(buf, p, sizeof(buf));
	error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
	if (error != 0 || req->newptr == NULL)
		return (error);
	return (cpu_idle_selector(buf) ? 0 : EINVAL);
}

SYSCTL_PROC(_machdep, OID_AUTO, idle, CTLTYPE_STRING | CTLFLAG_RW, 0, 0,
    cpu_idle_sysctl, "A", "currently selected idle function");

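/*
 * Early tunable processing: honor the machdep.idle and
 * machdep.idle_apl31 loader tunables and enable the Apollo Lake MWAIT
 * workaround by default on affected parts.  For example, a
 * loader.conf(5) entry such as the following selects the mwait method
 * at boot (assuming the CPU supports it):
 *
 *	machdep.idle="mwait"
 */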
static void
cpu_idle_tun(void *unused __unused)
{
	char tunvar[16];

	if (TUNABLE_STR_FETCH("machdep.idle", tunvar, sizeof(tunvar)))
		cpu_idle_selector(tunvar);
	if (cpu_vendor_id == CPU_VENDOR_INTEL && cpu_id == 0x506c9) {
		/*
		 * Apollo Lake errata APL31 (public errata APL30).
		 * Stores to the armed address range may not trigger
		 * MWAIT to resume execution.  OS needs to use
		 * interrupts to wake processors from MWAIT-induced
		 * sleep states.
		 */
		cpu_idle_apl31_workaround = 1;
	}
	TUNABLE_INT_FETCH("machdep.idle_apl31", &cpu_idle_apl31_workaround);
}
SYSINIT(cpu_idle_tun, SI_SUB_CPU, SI_ORDER_MIDDLE, cpu_idle_tun, NULL);

static int panic_on_nmi = 1;
SYSCTL_INT(_machdep, OID_AUTO, panic_on_nmi, CTLFLAG_RWTUN,
    &panic_on_nmi, 0,
    "Panic on NMI raised by hardware failure");
int nmi_is_broadcast = 1;
SYSCTL_INT(_machdep, OID_AUTO, nmi_is_broadcast, CTLFLAG_RWTUN,
    &nmi_is_broadcast, 0,
    "Chipset NMI is broadcast");
#ifdef KDB
int kdb_on_nmi = 1;
SYSCTL_INT(_machdep, OID_AUTO, kdb_on_nmi, CTLFLAG_RWTUN,
    &kdb_on_nmi, 0,
    "Go to KDB on NMI with unknown source");
#endif

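/*
 * Handle an NMI on the current CPU: let the ISA bridge claim it
 * (memory parity / power-fail style failures, panicking if
 * machdep.panic_on_nmi is set), and otherwise drop into the debugger
 * when kdb_on_nmi permits.
 */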
void
nmi_call_kdb(u_int cpu, u_int type, struct trapframe *frame)
{
	bool claimed = false;

#ifdef DEV_ISA
	/* machine/parity/power fail/"kitchen sink" faults */
	if (isa_nmi(frame->tf_err)) {
		claimed = true;
		if (panic_on_nmi)
			panic("NMI indicates hardware failure");
	}
#endif /* DEV_ISA */
#ifdef KDB
	if (!claimed && kdb_on_nmi) {
		/*
		 * NMI can be hooked up to a pushbutton for debugging.
		 */
		printf("NMI/cpu%d ... going to debugger\n", cpu);
		kdb_trap(type, 0, frame);
	}
#endif /* KDB */
}

void
nmi_handle_intr(u_int type, struct trapframe *frame)
{

#ifdef SMP
	if (nmi_is_broadcast) {
		nmi_call_kdb_smp(type, frame);
		return;
	}
#endif
	nmi_call_kdb(PCPU_GET(cpuid), type, frame);
}

int hw_ibrs_active;
int hw_ibrs_disable = 1;

SYSCTL_INT(_hw, OID_AUTO, ibrs_active, CTLFLAG_RD, &hw_ibrs_active, 0,
    "Indirect Branch Restricted Speculation active");

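/*
 * Recompute the IBRS state from the hw.ibrs_disable knob.  On CPUs
 * advertising IBRS_ALL the IBRS bit is toggled directly in
 * MSR_IA32_SPEC_CTRL; otherwise hw_ibrs_active records whether the
 * mitigation should be applied (it requires IBPB support).
 */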
void
hw_ibrs_recalculate(void)
{
	uint64_t v;

	if ((cpu_ia32_arch_caps & IA32_ARCH_CAP_IBRS_ALL) != 0) {
		if (hw_ibrs_disable) {
			v = rdmsr(MSR_IA32_SPEC_CTRL);
			v &= ~(uint64_t)IA32_SPEC_CTRL_IBRS;
			wrmsr(MSR_IA32_SPEC_CTRL, v);
		} else {
			v = rdmsr(MSR_IA32_SPEC_CTRL);
			v |= IA32_SPEC_CTRL_IBRS;
			wrmsr(MSR_IA32_SPEC_CTRL, v);
		}
		return;
	}
	hw_ibrs_active = (cpu_stdext_feature3 & CPUID_STDEXT3_IBPB) != 0 &&
	    !hw_ibrs_disable;
}

static int
hw_ibrs_disable_handler(SYSCTL_HANDLER_ARGS)
{
	int error, val;

	val = hw_ibrs_disable;
	error = sysctl_handle_int(oidp, &val, 0, req);
	if (error != 0 || req->newptr == NULL)
		return (error);
	hw_ibrs_disable = val != 0;
	hw_ibrs_recalculate();
	return (0);
}
SYSCTL_PROC(_hw, OID_AUTO, ibrs_disable, CTLTYPE_INT | CTLFLAG_RWTUN |
    CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0, hw_ibrs_disable_handler, "I",
    "Disable Indirect Branch Restricted Speculation");

int hw_ssb_active;
int hw_ssb_disable;

SYSCTL_INT(_hw, OID_AUTO, spec_store_bypass_disable_active, CTLFLAG_RD,
    &hw_ssb_active, 0,
    "Speculative Store Bypass Disable active");

static void
hw_ssb_set_one(bool enable)
{
	uint64_t v;

	v = rdmsr(MSR_IA32_SPEC_CTRL);
	if (enable)
		v |= (uint64_t)IA32_SPEC_CTRL_SSBD;
	else
		v &= ~(uint64_t)IA32_SPEC_CTRL_SSBD;
	wrmsr(MSR_IA32_SPEC_CTRL, v);
}

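/*
 * Apply the SSBD setting, either on the current CPU only or, by
 * temporarily binding the calling thread to each CPU in turn, on every
 * CPU in the system.  Does nothing (and marks SSBD inactive) when the
 * CPU does not advertise SSBD support.
 */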
static void
hw_ssb_set(bool enable, bool for_all_cpus)
{
	struct thread *td;
	int bound_cpu, i, is_bound;

	if ((cpu_stdext_feature3 & CPUID_STDEXT3_SSBD) == 0) {
		hw_ssb_active = 0;
		return;
	}
	hw_ssb_active = enable;
	if (for_all_cpus) {
		td = curthread;
		thread_lock(td);
		is_bound = sched_is_bound(td);
		bound_cpu = td->td_oncpu;
		CPU_FOREACH(i) {
			sched_bind(td, i);
			hw_ssb_set_one(enable);
		}
		if (is_bound)
			sched_bind(td, bound_cpu);
		else
			sched_unbind(td);
		thread_unlock(td);
	} else {
		hw_ssb_set_one(enable);
	}
}

void
hw_ssb_recalculate(bool all_cpus)
{

	switch (hw_ssb_disable) {
	default:
		hw_ssb_disable = 0;
		/* FALLTHROUGH */
	case 0: /* off */
		hw_ssb_set(false, all_cpus);
		break;
	case 1: /* on */
		hw_ssb_set(true, all_cpus);
		break;
	case 2: /* auto */
		hw_ssb_set((cpu_ia32_arch_caps & IA32_ARCH_CAP_SSBD_NO) != 0 ?
		    false : true, all_cpus);
		break;
	}
}

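/*
 * hw.spec_store_bypass_disable: 0 forces SSBD off, 1 forces it on, and
 * 2 ("auto") enables it only on CPUs that do not advertise
 * IA32_ARCH_CAP_SSBD_NO.  For example, to let the kernel decide:
 *
 *	# sysctl hw.spec_store_bypass_disable=2
 */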
static int
hw_ssb_disable_handler(SYSCTL_HANDLER_ARGS)
{
	int error, val;

	val = hw_ssb_disable;
	error = sysctl_handle_int(oidp, &val, 0, req);
	if (error != 0 || req->newptr == NULL)
		return (error);
	hw_ssb_disable = val;
	hw_ssb_recalculate(true);
	return (0);
}
SYSCTL_PROC(_hw, OID_AUTO, spec_store_bypass_disable, CTLTYPE_INT |
    CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0,
    hw_ssb_disable_handler, "I",
    "Speculative Store Bypass Disable (0 - off, 1 - on, 2 - auto)");

/*
 * Enable and restore kernel text write permissions.
 * Callers must ensure that disable_wp()/restore_wp() are executed on
 * the same core, without being rescheduled in between.
 */
bool
disable_wp(void)
{
	u_int cr0;

	cr0 = rcr0();
	if ((cr0 & CR0_WP) == 0)
		return (false);
	load_cr0(cr0 & ~CR0_WP);
	return (true);
}

void
restore_wp(bool old_wp)
{

	if (old_wp)
		load_cr0(rcr0() | CR0_WP);
}
933